commit b0fa1d2aeadecbcba10ef90cf467c835aef693b1 Author: Pavan Deolasee Date: Mon Sep 5 09:26:11 2016 +0530 Add support for WARM (Write Amplification Reduction Method) We have used HOT updates to handle the cases when the UPDATE does not change any index column. In such cases, we could avoid inserting duplicate index entries, which not only reduces index bloat, but also makes it easier to clean up dead tuples without requiring to visit the indexes. But when an UPDATE changes an index column, we insert duplicate entries in all indexes. This results in write amplification especially for tables with large number of indexes. WARM takes it further by now avoiding duplicate index entries for indexes whose columns are not being updated. What we do is only insert new entries those indexes which have changed, but instead of indexing the actual TID of the new tuple, we index the root line pointer of the HOT chain. As a side effect, for correctness we now must verify that the index is pointing to a tuple which really satisifes the index key. Each index AM must implement an amrecheck method which returns true iff the index key constructed from the given heap tuple matches the given index key. The patch currently works with several restrictions: 1. WARM updates on system tables are disabled. While we disabled them for ease of development, there could be some issues with system tables because they apparently do not support lossy indexes. 2. Only one WARM update per HOT chain is allowed. That seems very restrictive, but even that should reduce index bloat by 50%. Subsequently, we will optimise this by either allowing multiple WARM updates or by turning WARM chains to HOT chains as and when tuples retire. 3. Expression and partial indexes don't work with WARM updates. For expression indexes, we will need to find a way to determine if the old and new tuple computes to the same index expression and avoid adding a duplicate index entry in such cases. This is not only required to avoid unnecessary index bloat, but also for correctness purposes. Similarly, for partial indexes, we must index the new entry if the old tuple did not satisfy the predicate but the new one does. 4. If table has an index which does not support amrecheck method, WARM is disabled on such tables. diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index debf4f4..d49d179 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -138,6 +138,7 @@ blhandler(PG_FUNCTION_ARGS) amroutine->amendscan = blendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 1b45a4c..ba3fffb 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -111,6 +111,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amendscan = brinendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index f7f44b4..813c2c3 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -88,6 +88,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amendscan = gistendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index e3b1eef..d7c50c1 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -85,6 +85,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amendscan = hashendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = hashrecheck; PG_RETURN_POINTER(amroutine); } @@ -265,6 +266,8 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; ItemPointer current; bool res; + IndexTuple itup; + /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; @@ -302,8 +305,6 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { - IndexTuple itup; - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 4825558..cf44214 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -59,6 +59,8 @@ _hash_next(IndexScanDesc scan, ScanDirection dir) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; + if (scan->xs_want_itup) + scan->xs_itup = itup; return true; } @@ -263,6 +265,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; + if (scan->xs_want_itup) + scan->xs_itup = itup; + return true; } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 822862d..71377ab 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -17,8 +17,12 @@ #include "access/hash.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" +#include "executor/executor.h" +#include "nodes/execnodes.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/datum.h" /* @@ -352,3 +356,110 @@ _hash_binsearch_last(Page page, uint32 hash_value) return lower; } + +/* + * Recheck if the heap tuple satisfies the key stored in the index tuple + */ +bool +hashrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple) +{ + IndexInfo *indexInfo; + EState *estate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum values2[INDEX_MAX_KEYS]; + bool isnull2[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + indexInfo = BuildIndexInfo(indexRel); + + /* + * The heap tuple must be put into a slot for FormIndexDatum. + */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); + + ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); + + /* + * Typically the index won't have expressions, but if it does we need an + * EState to evaluate them. We need it for exclusion constraints too, + * even if they are just on simple columns. + */ + if (indexInfo->ii_Expressions != NIL || + indexInfo->ii_ExclusionOps != NULL) + { + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + } + else + estate = NULL; + + /* + * Form the index values and isnull flags for the index entry that we need + * to check. + * + * Note: if the index uses functions that are not as immutable as they are + * supposed to be, this could produce an index tuple different from the + * original. The index AM can catch such errors by verifying that it + * finds a matching index entry with the tuple's TID. For exclusion + * constraints we check this in check_exclusion_constraint(). + */ + FormIndexDatum(indexInfo, slot, estate, values, isnull); + + /* + * HASH indexes compute a hash value of the key and store that in the + * index. So we must first obtain the hash of the value obtained from the + * heap and then do a comparison + */ + _hash_convert_tuple(indexRel, values, isnull, values2, isnull2); + + equal = true; + for (i = 1; i <= natts; i++) + { + Datum indxvalue; + bool indxisnull; + + indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull); + + /* + * If both are NULL then they are equal + */ + if (isnull2[i - 1] && indxisnull) + continue; + + /* + * If either is NULL then they are not equal + */ + if (isnull2[i - 1] || indxisnull) + { + equal = false; + break; + } + + /* + * Now do a raw memory comparison + */ + att = indexRel->rd_att->attrs[i - 1]; + if (!datumIsEqual(values2[i - 1], indxvalue, att->attbyval, + att->attlen)) + { + equal = false; + break; + } + } + + if (estate != NULL) + FreeExecutorState(estate); + + ExecDropSingleTupleTableSlot(slot); + + return equal; + +} diff --git a/src/backend/access/heap/README.WARM b/src/backend/access/heap/README.WARM new file mode 100644 index 0000000..f793570 --- /dev/null +++ b/src/backend/access/heap/README.WARM @@ -0,0 +1,271 @@ +src/backend/access/heap/README.WARM + +Write Amplification Reduction Method (WARM) +=========================================== + +The Heap Only Tuple (HOT) feature greatly eliminated redudant index +entries and allowed re-use of the dead space occupied by previously +updated or deleted tuples (see src/backend/access/heap/README.HOT) + +One of the necessary conditions for satisfying HOT update is that the +update must not change a column used in any of the indexes on the table. +The condition is sometimes hard to meet, especially for complex +workloads with several indexes on large yet frequently updated tables. +Worse, sometimes only one or two index columns may be updated, but the +regular non-HOT update will still insert a new index entry in every +index on the table, irrespective of whether the key pertaining to the +index changed or not. + +WARM is a technique devised to address these problems. + + +Update Chains With Multiple Index Entries Pointing to the Root +-------------------------------------------------------------- + +When a non-HOT update is caused by an index key change, a new index +entry must be inserted for the changed index. But if the index key +hasn't changed for other indexes, we don't really need to insert a new +entry. Even though the existing index entry is pointing to the old +tuple, the new tuple is reachable via the t_ctid chain. To keep things +simple, a WARM update requires that the heap block must have enough +space to store the new version of the tuple. This is same as HOT +updates. + +In WARM, we ensure that every index entry always points to the root of +the WARM chain. In fact, a WARM chain looks exactly like a HOT chain +except for the fact that there could be multiple index entries pointing +to the root of the chain. So when new entry is inserted in an index for +updated tuple, and if we are doing a WARM update, the new entry is made +point to the root of the WARM chain. + +For example, if we have a table with two columns and two indexes on each +of the column. When a tuple is first inserted the table, we have exactly +one index entry pointing to the tuple from both indexes. + + lp [1] + [1111, aaaa] + + Index1's entry (1111) points to 1 + Index2's entry (aaaa) also points to 1 + +Now if the tuple's second column is updated and if there is room on the +page, we perform a WARM update. To do so, Index1 does not get any new +entry and Index2's new entry will still point to the root tuple of the +chain. + + lp [1] [2] + [1111, aaaa]->[111, bbbb] + + Index1's entry (1111) points to 1 + Index2's old entry (aaaa) points to 1 + Index2's new entry (bbbb) also points to 1 + +"A update chain which has more than one index entries pointing to its +root line pointer is called WARM chain and the action that creates a +WARM chain is called WARM update." + +Since all indexes always point to the root of the WARM chain, even when +there are more than one index entries, WARM chains can be pruned and +dead tuples can be removed without a need to do corresponding index +cleanup. + +While this solves the problem of pruning dead tuples from a HOT/WARM +chain, it also opens up a new technical challenge because now we have a +situation where a heap tuple is reachable from multiple index entries, +each having a different index key. While MVCC still ensures that only +valid tuples are returned, a tuple with a wrong index key may be +returned because of wrong index entries. In the above example, tuple +[1111, bbbb] is reachable from both keys (aaaa) as well as (bbbb). For +this reason, tuples returned from a WARM chain must always be rechecked +for index key-match. + +Recheck Index Key Againt Heap Tuple +----------------------------------- + +Since every Index AM has it's own notion of index tuples, each Index AM +must implement its own method to recheck heap tuples. For example, a +hash index stores the hash value of the column and hence recheck routine +for hash AM must first compute the hash value of the heap attribute and +then compare it against the value stored in the index tuple. + +The patch currently implement recheck routines for hash and btree +indexes. If the table has an index which doesn't support recheck +routine, WARM updates are disabled on such tables. + +Problem With Duplicate (key, ctid) Index Entries +------------------------------------------------ + +The index-key recheck logic works as long as there are no duplicate +index keys, both pointing to the same WARM chain. In that case, the same +valid tuple will be reachable via multiple index keys, yet satisfying +the index key checks. In the above example, if the tuple [1111, bbbb] is +again updated to [1111, aaaa] and if we insert a new index entry (aaaa) +pointing to the root line pointer, we will end up with the following +structure: + + lp [1] [2] [3] + [1111, aaaa]->[1111, bbbb]->[1111, aaaa] + + Index1's entry (1111) points to 1 + Index2's oldest entry (aaaa) points to 1 + Index2's old entry (bbbb) also points to 1 + Index2's new entry (aaaa) also points to 1 + +We must solve this problem to ensure that the same tuple is not +reachable via multiple index pointers. There are couple of ways to +address this issue: + +1. Do not allow WARM update to a tuple from a WARM chain. This +guarantees that there can never be duplicate index entries to the same +root line pointer because we must have checked for old and new index +keys while doing the first WARM update. + +2. Do not allow duplicate (key, ctid) index pointers. In the above +example, since (aaaa, 1) already exists in the index, we must not insert +a duplicate index entry. + +The patch currently implements 1 i.e. do not do WARM updates to a tuple +from a WARM chain. HOT updates are fine because they do not add a new +index entry. + +Even with the restriction, this is a significant improvement because the +number of regular UPDATEs are curtailed down to half. + +Expression and Partial Indexes +------------------------------ + +Expressions may evaluate to the same value even if the underlying column +values have changed. A simple example is an index on "lower(col)" which +will return the same value if the new heap value only differs in the +case sensitivity. So we can not solely rely on the heap column check to +decide whether or not to insert a new index entry for expression +indexes. Similarly, for partial indexes, the predicate expression must +be evaluated to decide whether or not to cause a new index entry when +columns referred in the predicate expressions change. + +(None of these things are currently implemented and we squarely disallow +WARM update if a column from expression indexes or predicate has +changed). + + +Efficiently Finding the Root Line Pointer +----------------------------------------- + +During WARM update, we must be able to find the root line pointer of the +tuple being updated. It must be noted that the t_ctid field in the heap +tuple header is usually used to find the next tuple in the update chain. +But the tuple that we are updating, must be the last tuple in the update +chain. In such cases, the c_tid field usually points the tuple itself. +So in theory, we could use the t_ctid to store additional information in +the last tuple of the update chain, if the information about the tuple +being the last tuple is stored elsewhere. + +We now utilize another bit from t_infomask2 to explicitly identify that +this is the last tuple in the update chain. + +HEAP_LATEST_TUPLE - When this bit is set, the tuple is the last tuple in +the update chain. The OffsetNumber part of t_ctid points to the root +line pointer of the chain when HEAP_LATEST_TUPLE flag is set. + +If UPDATE operation is aborted, the last tuple in the update chain +becomes dead. The root line pointer information stored in the tuple +which remains the last valid tuple in the chain is also lost. In such +rare cases, the root line pointer must be found in a hard way by +scanning the entire heap page. + +Tracking WARM Chains +-------------------- + +The old and every subsequent tuple in the chain is marked with a special +HEAP_WARM_TUPLE flag. We use the last remaining bit in t_infomask2 to +store this information. + +When a tuple is returned from a WARM chain, the caller must do +additional checks to ensure that the tuple matches the index key. Even +if the tuple comes precedes the WARM update in the chain, it must still +be rechecked for the index key match (case when old tuple is returned by +the new index key). So we must follow the update chain everytime to the +end to see check if this is a WARM chain. + +When the old updated tuple is retired and the root line pointer is +converted into a redirected line pointer, we can copy the information +about WARM chain to the redirected line pointer by storing a special +value in the lp_len field of the line pointer. This will handle the most +common case where a WARM chain is replaced by a redirect line pointer +and a single tuple in the chain. + +Converting WARM chains back to HOT chains (VACUUM ?) +---------------------------------------------------- + +The current implementation of WARM allows only one WARM update per +chain. This simplifies the design and addresses certain issues around +duplicate scans. But this also implies that the benefit of WARM will be +no more than 50%, which is still significant, but if we could return +WARM chains back to normal status, we could do far more WARM updates. + +A distinct property of a WARM chain is that at least one index has more +than one live index entries pointing to the root of the chain. In other +words, if we can remove duplicate entry from every index or conclusively +prove that there are no duplicate index entries for the root line +pointer, the chain can again be marked as HOT. + +Here is one idea: + +A WARM chain has two parts, separated by the tuple that caused WARM +update. All tuples in each part has matching index keys, but certain +index keys may not match between these two parts. Lets say we mark heap +tuples in each part with a special Red-Blue flag. The same flag is +replicated in the index tuples. For example, when new rows are inserted +in a table, they are marked with Blue flag and the index entries +associated with those rows are also marked with Blue flag. When a row is +WARM updated, the new version is marked with Red flag and the new index +entry created by the update is also marked with Red flag. + + +Heap chain: [1] [2] [3] [4] + [aaaa, 1111]B -> [aaaa, 1111]B -> [bbbb, 1111]R -> [bbbb, 1111]R + +Index1: (aaaa)B points to 1 (satisfies only tuples marked with B) + (bbbb)R points to 1 (satisfies only tuples marked with R) + +Index2: (1111)B points to 1 (satisfied bith B and R tuples) + + +It's clear that for indexes with Red and Blue pointers, a heap tuple +with Blue flag will be reachable from Blue pointer and that with Red +flag will be reachable from Red pointer. But for indexes which did not +create a new entry, both Blue and Red tuples will be reachable from Blue +pointer (there is no Red pointer in such indexes). So, as a side note, +matching Red and Blue flags is not enough from index scan perspective. + +During first heap scan of VACUUM, we look for tuples with +HEAP_WARM_TUPLE set. If all live tuples in the chain are either marked +with Blue flag or Red flag (but no mix of Red and Blue), then the chain +is a candidate for HOT conversion. We remember the root line pointer +and Red-Blue flag of the WARM chain in a separate array. + +If we have a Red WARM chain, then our goal is to remove Blue pointers +and vice versa. But there is a catch. For Index2 above, there is only +Blue pointer and that must not be removed. IOW we should remove Blue +pointer iff a Red pointer exists. Since index vacuum may visit Red and +Blue pointers in any order, I think we will need another index pass to +remove dead index pointers. So in the first index pass we check which +WARM candidates have 2 index pointers. In the second pass, we remove the +dead pointer and reset Red flag is the surviving index pointer is Red. + +During the second heap scan, we fix WARM chain by clearing +HEAP_WARM_TUPLE flag and also reset Red flag to Blue. + +There are some more problems around aborted vacuums. For example, if +vacuum aborts after changing Red index flag to Blue but before removing +the other Blue pointer, we will end up with two Blue pointers to a Red +WARM chain. But since the HEAP_WARM_TUPLE flag on the heap tuple is +still set, further WARM updates to the chain will be blocked. I guess we +will need some special handling for case with multiple Blue pointers. We +can either leave these WARM chains alone and let them die with a +subsequent non-WARM update or must apply heap-recheck logic during index +vacuum to find the dead pointer. Given that vacuum-aborts are not +common, I am inclined to leave this case unhandled. We must still check +for presence of multiple Blue pointers and ensure that we don't +accidently remove either of the Blue pointers and not clear WARM chains +either. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ccf84be..800a7c0 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -99,7 +99,10 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, static void HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, Bitmapset *key_attrs, Bitmapset *id_attrs, - bool *satisfies_hot, bool *satisfies_key, + Bitmapset *exprindx_attrs, + Bitmapset **updated_attrs, + bool *satisfies_hot, bool *satisfies_warm, + bool *satisfies_key, bool *satisfies_id, HeapTuple oldtup, HeapTuple newtup); static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, @@ -1960,6 +1963,76 @@ heap_fetch(Relation relation, } /* + * Check if the HOT chain is originating or continuing at tid ever became a + * WARM chain, even if the actual UPDATE operation finally aborted. + */ +static void +hot_check_warm_chain(Page dp, ItemPointer tid, bool *recheck) +{ + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + HeapTupleData heapTuple; + + if (*recheck == true) + return; + + offnum = ItemPointerGetOffsetNumber(tid); + heapTuple.t_self = *tid; + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + break; + + heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + ItemPointerSetOffsetNumber(&heapTuple.t_self, offnum); + + /* + * The xmin should match the previous xmax value, else chain is + * broken. + */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple.t_data))) + break; + + + /* + * Presence of either WARM or WARM updated tuple signals possible + * breakage and the caller must recheck tuple returned from this chain + * for index satisfaction + */ + if (HeapTupleHeaderIsHeapWarmTuple(heapTuple.t_data)) + { + *recheck = true; + break; + } + + /* + * Check to see if HOT chain continues past this tuple; if so fetch + * the next offnum and loop around. + */ + if (HeapTupleIsHotUpdated(&heapTuple)) + { + offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid); + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple.t_data); + } + else + break; /* end of chain */ + } + +} + +/* * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot * * On entry, *tid is the TID of a tuple (either a simple tuple, or the root @@ -1979,11 +2052,14 @@ heap_fetch(Relation relation, * Unlike heap_fetch, the caller must already have pin and (at least) share * lock on the buffer; it is still pinned/locked at exit. Also unlike * heap_fetch, we do not report any pgstats count; caller may do so if wanted. + * + * recheck should be set false on entry by caller, will be set true on exit + * if a WARM tuple is encountered. */ bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call) + bool *all_dead, bool first_call, bool *recheck) { Page dp = (Page) BufferGetPage(buffer); TransactionId prev_xmax = InvalidTransactionId; @@ -2025,6 +2101,16 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, /* Follow the redirect */ offnum = ItemIdGetRedirect(lp); at_chain_start = false; + + /* Check if it's a WARM chain */ + if (recheck && *recheck == false) + { + if (ItemIdIsHeapWarm(lp)) + { + *recheck = true; + Assert(!IsSystemRelation(relation)); + } + } continue; } /* else must be end of chain */ @@ -2037,9 +2123,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum); /* - * Shouldn't see a HEAP_ONLY tuple at chain start. + * Shouldn't see a HEAP_ONLY tuple at chain start, unless we are + * dealing with a WARM updated tuple in which case deferred triggers + * may request to fetch a WARM tuple from middle of a chain. */ - if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple) && + !HeapTupleIsHeapWarmTuple(heapTuple)) break; /* @@ -2052,6 +2141,22 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, break; /* + * Check if there exists a WARM tuple somewhere down the chain and set + * recheck to TRUE. + * + * XXX This is not very efficient right now, and we should look for + * possible improvements here + */ + if (recheck && *recheck == false) + { + hot_check_warm_chain(dp, &heapTuple->t_self, recheck); + + /* WARM is not supported on system tables yet */ + if (*recheck == true) + Assert(!IsSystemRelation(relation)); + } + + /* * When first_call is true (and thus, skip is initially false) we'll * return the first tuple we find. But on later passes, heapTuple * will initially be pointing to the tuple we returned last time. @@ -2124,18 +2229,41 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, */ bool heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, - bool *all_dead) + bool *all_dead, bool *recheck, Buffer *cbuffer, + HeapTuple heapTuple) { bool result; Buffer buffer; - HeapTupleData heapTuple; + ItemPointerData ret_tid = *tid; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); - result = heap_hot_search_buffer(tid, relation, buffer, snapshot, - &heapTuple, all_dead, true); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); + result = heap_hot_search_buffer(&ret_tid, relation, buffer, snapshot, + heapTuple, all_dead, true, recheck); + + /* + * If we are returning a potential candidate tuple from this chain and the + * caller has requested for "recheck" hint, keep the buffer locked and + * pinned. The caller must release the lock and pin on the buffer in all + * such cases + */ + if (!result || !recheck || !(*recheck)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + + /* + * Set the caller supplied tid with the actual location of the tuple being + * returned + */ + if (result) + { + *tid = ret_tid; + if (cbuffer) + *cbuffer = buffer; + } + return result; } @@ -3442,13 +3570,15 @@ simple_heap_delete(Relation relation, ItemPointer tid) HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode) + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **updated_attrs, bool *warm_update) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); Bitmapset *hot_attrs; Bitmapset *key_attrs; Bitmapset *id_attrs; + Bitmapset *exprindx_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -3469,9 +3599,11 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool have_tuple_lock = false; bool iscombo; bool satisfies_hot; + bool satisfies_warm; bool satisfies_key; bool satisfies_id; bool use_hot_update = false; + bool use_warm_update = false; bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false; @@ -3496,6 +3628,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("cannot update tuples during a parallel operation"))); + /* Assume no-warm update */ + if (warm_update) + *warm_update = false; + /* * Fetch the list of attributes to be checked for HOT update. This is * wasted effort if we fail to update or have to put the new tuple on a @@ -3512,6 +3648,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); id_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY); + exprindx_attrs = RelationGetIndexAttrBitmap(relation, + INDEX_ATTR_BITMAP_EXPR_PREDICATE); block = ItemPointerGetBlockNumber(otid); offnum = ItemPointerGetOffsetNumber(otid); @@ -3571,7 +3709,10 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, * serendipitiously arrive at the same key values. */ HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, id_attrs, - &satisfies_hot, &satisfies_key, + exprindx_attrs, + updated_attrs, + &satisfies_hot, &satisfies_warm, + &satisfies_key, &satisfies_id, &oldtup, newtup); if (satisfies_key) { @@ -4118,6 +4259,34 @@ l2: */ if (satisfies_hot) use_hot_update = true; + else + { + /* + * If no WARM updates yet on this chain, let this update be a WARM + * update. + * + * We check for both warm and warm updated tuples since if the + * previous WARM update aborted, we may still have added + * another index entry for this HOT chain. In such situations, we + * must not attempt a WARM update until duplicate (key, CTID) index + * entry issue is sorted out + * + * XXX Later we'll add more checks to ensure WARM chains can + * further be WARM updated. This is probably good to do first rounf + * of tests of remaining functionality + * + * XXX Disable WARM updates on system tables. There is nothing in + * principle that stops us from supporting this. But it would + * require API change to propogate the changed columns back to the + * caller so that CatalogUpdateIndexes() can avoid adding new + * entries to indexes that are not changed by update. This will be + * fixed once basic patch is tested. !!FIXME + */ + if (satisfies_warm && + !HeapTupleIsHeapWarmTuple(&oldtup) && + !IsSystemRelation(relation)) + use_warm_update = true; + } } else { @@ -4158,6 +4327,21 @@ l2: HeapTupleSetHeapOnly(heaptup); /* Mark the caller's copy too, in case different from heaptup */ HeapTupleSetHeapOnly(newtup); + + /* + * Even if we are doing a HOT update, we must carry forward the WARM + * flag because we may have already inserted another index entry + * pointing to our root and a third entry may create duplicates + * + * XXX This should be revisited if we get index (key, CTID) duplicate + * detection mechanism in place + */ + if (HeapTupleIsHeapWarmTuple(&oldtup)) + { + HeapTupleSetHeapWarmTuple(heaptup); + HeapTupleSetHeapWarmTuple(newtup); + } + /* * For HOT (or WARM) updated tuples, we store the offset of the root * line pointer of this chain in the ip_posid field of the new tuple. @@ -4173,12 +4357,38 @@ l2: ItemPointerGetOffsetNumber(&(oldtup.t_self)), &root_offnum); } + else if (use_warm_update) + { + Assert(!IsSystemRelation(relation)); + + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + HeapTupleSetHeapWarmTuple(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + HeapTupleSetHeapWarmTuple(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + HeapTupleSetHeapWarmTuple(newtup); + if (HeapTupleHeaderHasRootOffset(oldtup.t_data)) + root_offnum = HeapTupleHeaderGetRootOffset(oldtup.t_data); + else + heap_get_root_tuple_one(page, + ItemPointerGetOffsetNumber(&(oldtup.t_self)), + &root_offnum); + + /* Let the caller know we did a WARM update */ + if (warm_update) + *warm_update = true; + } else { /* Make sure tuples are correctly marked as not-HOT */ HeapTupleClearHotUpdated(&oldtup); HeapTupleClearHeapOnly(heaptup); HeapTupleClearHeapOnly(newtup); + HeapTupleClearHeapWarmTuple(heaptup); + HeapTupleClearHeapWarmTuple(newtup); root_offnum = InvalidOffsetNumber; } @@ -4297,7 +4507,12 @@ l2: if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); - pgstat_count_heap_update(relation, use_hot_update); + /* + * Even with WARM we still count stats using use_hot_update, + * since we continue to still use that term even though it is + * now more frequent that previously. + */ + pgstat_count_heap_update(relation, use_hot_update || use_warm_update); /* * If heaptup is a private copy, release it. Don't forget to copy t_self @@ -4405,6 +4620,13 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, * will be checking very similar sets of columns, and doing the same tests on * them, it makes sense to optimize and do them together. * + * The exprindx_attrs designates the set of attributes used in expression or + * predicate indexes. Currently, we don't allow WARM updates if expression or + * predicate index column is updated + * + * If updated_attrs is not NULL, then the caller is always interested in + * knowing the list of changed attributes + * * We receive three bitmapsets comprising the three sets of columns we're * interested in. Note these are destructively modified; that is OK since * this is invoked at most once in heap_update. @@ -4417,7 +4639,11 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, static void HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, Bitmapset *key_attrs, Bitmapset *id_attrs, - bool *satisfies_hot, bool *satisfies_key, + Bitmapset *exprindx_attrs, + Bitmapset **updated_attrs, + bool *satisfies_hot, + bool *satisfies_warm, + bool *satisfies_key, bool *satisfies_id, HeapTuple oldtup, HeapTuple newtup) { @@ -4454,8 +4680,11 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, * Since the HOT attributes are a superset of the key attributes and * the key attributes are a superset of the id attributes, this logic * is guaranteed to identify the next column that needs to be checked. + * + * If the caller also wants to know the list of updated index + * attributes, we must scan through all the attributes */ - if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber) + if ((hot_result || updated_attrs) && next_hot_attnum > FirstLowInvalidHeapAttributeNumber) check_now = next_hot_attnum; else if (key_result && next_key_attnum > FirstLowInvalidHeapAttributeNumber) check_now = next_key_attnum; @@ -4476,8 +4705,16 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, if (check_now == next_id_attnum) id_result = false; + /* + * Add the changed attribute to updated_attrs if the caller has + * asked for it + */ + if (updated_attrs) + *updated_attrs = bms_add_member(*updated_attrs, check_now - + FirstLowInvalidHeapAttributeNumber); + /* if all are false now, we can stop checking */ - if (!hot_result && !key_result && !id_result) + if (!hot_result && !key_result && !id_result && !updated_attrs) break; } @@ -4488,7 +4725,7 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, * bms_first_member() will return -1 and the attribute number will end * up with a value less than FirstLowInvalidHeapAttributeNumber. */ - if (hot_result && check_now == next_hot_attnum) + if ((hot_result || updated_attrs) && check_now == next_hot_attnum) { next_hot_attnum = bms_first_member(hot_attrs); next_hot_attnum += FirstLowInvalidHeapAttributeNumber; @@ -4505,6 +4742,23 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs, } } + /* + * If an attributed used in the expression of an expression index or + * predicate of a predicate index has changed, we don't yet support WARM + * update + */ + if (updated_attrs && bms_overlap(*updated_attrs, exprindx_attrs)) + *satisfies_warm = false; + /* If the table does not support WARM update, honour that */ + else if (!relation->rd_supportswarm) + *satisfies_warm = false; + /* + * XXX Should we handle some more cases, such as when an update will result + * in many or most indexes, should we fall back to a regular update? + */ + else + *satisfies_warm = true; + *satisfies_hot = hot_result; *satisfies_key = key_result; *satisfies_id = id_result; @@ -4528,7 +4782,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, NULL, NULL); switch (result) { case HeapTupleSelfUpdated: @@ -7415,6 +7669,7 @@ log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, + OffsetNumber *warm, int nwarm, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid) @@ -7428,6 +7683,7 @@ log_heap_clean(Relation reln, Buffer buffer, xlrec.latestRemovedXid = latestRemovedXid; xlrec.nredirected = nredirected; xlrec.ndead = ndead; + xlrec.nwarm = nwarm; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapClean); @@ -7450,6 +7706,10 @@ log_heap_clean(Relation reln, Buffer buffer, XLogRegisterBufData(0, (char *) nowdead, ndead * sizeof(OffsetNumber)); + if (nwarm > 0) + XLogRegisterBufData(0, (char *) warm, + nwarm * sizeof(OffsetNumber)); + if (nunused > 0) XLogRegisterBufData(0, (char *) nowunused, nunused * sizeof(OffsetNumber)); @@ -7555,6 +7815,7 @@ log_heap_update(Relation reln, Buffer oldbuf, bool need_tuple_data = RelationIsLogicallyLogged(reln); bool init; int bufflags; + bool warm_update; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); @@ -7566,6 +7827,9 @@ log_heap_update(Relation reln, Buffer oldbuf, else info = XLOG_HEAP_UPDATE; + if (HeapTupleIsHeapWarmTuple(newtup)) + warm_update = true; + /* * If the old and new tuple are on the same page, we only need to log the * parts of the new tuple that were changed. That saves on the amount of @@ -7639,6 +7903,8 @@ log_heap_update(Relation reln, Buffer oldbuf, xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; } } + if (warm_update) + xlrec.flags |= XLH_UPDATE_WARM_UPDATE; /* If new tuple is the single and first tuple on page... */ if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && @@ -8006,24 +8272,38 @@ heap_xlog_clean(XLogReaderState *record) OffsetNumber *redirected; OffsetNumber *nowdead; OffsetNumber *nowunused; + OffsetNumber *warm; int nredirected; int ndead; int nunused; + int nwarm; + int i; Size datalen; + bool warmchain[MaxHeapTuplesPerPage + 1]; redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen); nredirected = xlrec->nredirected; ndead = xlrec->ndead; + nwarm = xlrec->nwarm; + end = (OffsetNumber *) ((char *) redirected + datalen); nowdead = redirected + (nredirected * 2); - nowunused = nowdead + ndead; - nunused = (end - nowunused); + warm = nowdead + ndead; + nowunused = warm + nwarm; + + nunused = (end - warm); Assert(nunused >= 0); + memset(warmchain, 0, sizeof (warmchain)); + for (i = 0; i < nwarm; i++) + warmchain[warm[i]] = true; + + /* Update all item pointers per the record, and repair fragmentation */ heap_page_prune_execute(buffer, redirected, nredirected, + warmchain, nowdead, ndead, nowunused, nunused); @@ -8610,16 +8890,22 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) Size freespace = 0; XLogRedoAction oldaction; XLogRedoAction newaction; + bool warm_update = false; /* initialize to keep the compiler quiet */ oldtup.t_data = NULL; oldtup.t_len = 0; + if (xlrec->flags & XLH_UPDATE_WARM_UPDATE) + warm_update = true; + XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk); if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk)) { /* HOT updates are never done across pages */ Assert(!hot_update); + /* WARM updates are never done across pages */ + Assert(!warm_update); } else oldblk = newblk; @@ -8679,6 +8965,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the old tuple has a WARM tuple */ + if (warm_update) + HeapTupleHeaderSetHeapWarmTuple(htup); + /* Set forward chain link in t_ctid */ HeapTupleHeaderSetNextCtid(htup, ItemPointerGetBlockNumber(&newtid), ItemPointerGetOffsetNumber(&newtid)); @@ -8814,6 +9105,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); HeapTupleHeaderSetCmin(htup, FirstCommandId); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + + /* Mark the new tuple has a WARM tuple */ + if (warm_update) + HeapTupleHeaderSetHeapWarmTuple(htup); + /* Make sure there is no forward chain link in t_ctid */ HeapTupleHeaderSetHeapLatest(htup); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 7c2231a..d71a297 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -36,12 +36,19 @@ typedef struct int nredirected; /* numbers of entries in arrays below */ int ndead; int nunused; + int nwarm; /* arrays that accumulate indexes of items to be changed */ OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; + OffsetNumber warm[MaxHeapTuplesPerPage]; /* marked[i] is TRUE if item i is entered in one of the above arrays */ bool marked[MaxHeapTuplesPerPage + 1]; + /* + * warmchain[i] is TRUE if item is becoming redirected lp and points a WARM + * chain + */ + bool warmchain[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ @@ -54,6 +61,8 @@ static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum); static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); +static void heap_prune_record_warmupdate(PruneState *prstate, + OffsetNumber offnum); static void heap_get_root_tuples_internal(Page page, OffsetNumber target_offnum, OffsetNumber *root_offsets); @@ -203,8 +212,9 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, */ prstate.new_prune_xid = InvalidTransactionId; prstate.latestRemovedXid = *latestRemovedXid; - prstate.nredirected = prstate.ndead = prstate.nunused = 0; + prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nwarm = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); + memset(prstate.warmchain, 0, sizeof(prstate.marked)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); @@ -241,6 +251,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, */ heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, + prstate.warmchain, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused); @@ -268,6 +279,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, recptr = log_heap_clean(relation, buffer, prstate.redirected, prstate.nredirected, + prstate.warm, prstate.nwarm, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, prstate.latestRemovedXid); @@ -479,6 +491,12 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) break; + if (HeapTupleHeaderIsHeapWarmTuple(htup)) + { + Assert(!IsSystemRelation(relation)); + heap_prune_record_warmupdate(prstate, rootoffnum); + } + /* * OK, this tuple is indeed a member of the chain. */ @@ -668,6 +686,18 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) prstate->marked[offnum] = true; } +/* Record item pointer which is a root of a WARM chain */ +static void +heap_prune_record_warmupdate(PruneState *prstate, OffsetNumber offnum) +{ + Assert(prstate->nwarm < MaxHeapTuplesPerPage); + if (prstate->warmchain[offnum]) + return; + prstate->warm[prstate->nwarm] = offnum; + prstate->nwarm++; + prstate->warmchain[offnum] = true; +} + /* * Perform the actual page changes needed by heap_page_prune. @@ -681,6 +711,7 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, + bool *warmchain, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused) { @@ -697,6 +728,12 @@ heap_page_prune_execute(Buffer buffer, ItemId fromlp = PageGetItemId(page, fromoff); ItemIdSetRedirect(fromlp, tooff); + + /* + * Save information about WARM chains in the item itself + */ + if (warmchain[fromoff]) + ItemIdSetHeapWarm(fromlp); } /* Update all now-dead line pointers */ diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 65c941d..4f9fb12 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -99,7 +99,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) else scan->orderByData = NULL; - scan->xs_want_itup = false; /* may be set later */ + scan->xs_want_itup = true; /* hack for now to always get index tuple */ /* * During recovery we ignore killed tuples and don't bother to kill them diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 54b71cb..149a02d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -71,10 +71,12 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/index.h" +#include "executor/executor.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "utils/datum.h" #include "utils/snapmgr.h" #include "utils/tqual.h" @@ -409,7 +411,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_ctup.t_self. It should also set - * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention + * scan->xs_tuple_recheck and possibly scan->xs_itup, though we pay no attention * to those fields here. */ found = scan->indexRelation->rd_amroutine->amgettuple(scan, direction); @@ -448,7 +450,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan * call). * - * Note: caller must check scan->xs_recheck, and perform rechecking of the + * Note: caller must check scan->xs_tuple_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have * enough information to do it efficiently in the general case. * ---------------- @@ -475,6 +477,13 @@ index_fetch_heap(IndexScanDesc scan) */ if (prev_buf != scan->xs_cbuf) heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf); + + /* + * If we're not always re-checking, reset recheck for this tuple + */ + if (!scan->xs_recheck) + scan->xs_tuple_recheck = false; + } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -484,32 +493,63 @@ index_fetch_heap(IndexScanDesc scan) scan->xs_snapshot, &scan->xs_ctup, &all_dead, - !scan->xs_continue_hot); + !scan->xs_continue_hot, + &scan->xs_tuple_recheck); LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); if (got_heap_tuple) { + bool res = true; + + /* + * Ok we got a tuple which satisfies the snapshot, but if its part of a + * WARM chain, we must do additional checks to ensure that we are + * indeed returning a correct tuple. Note that if the index AM does not + * implement amrecheck method, then we don't any additional checks + * since WARM must have been disabled on such tables + * + * XXX What happens when a new index which does not support amcheck is + * added to the table? Do we need to handle this case or is CREATE + * INDEX and CREATE INDEX CONCURRENTLY smart enough to handle this + * issue? + */ + if (scan->xs_tuple_recheck && + scan->indexRelation->rd_amroutine->amrecheck) + { + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + res = scan->indexRelation->rd_amroutine->amrecheck( + scan->indexRelation, + scan->xs_itup, + scan->heapRelation, + &scan->xs_ctup); + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + } + /* * Only in a non-MVCC snapshot can more than one member of the HOT * chain be visible. */ scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); pgstat_count_heap_fetch(scan->indexRelation); - return &scan->xs_ctup; - } - /* We've reached the end of the HOT chain. */ - scan->xs_continue_hot = false; + if (res) + return &scan->xs_ctup; + } + else + { + /* We've reached the end of the HOT chain. */ + scan->xs_continue_hot = false; - /* - * If we scanned a whole HOT chain and found only dead tuples, tell index - * AM to kill its entry for that TID (this will take effect in the next - * amgettuple call, in index_getnext_tid). We do not do this when in - * recovery because it may violate MVCC to do so. See comments in - * RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + } return NULL; } diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index ef69290..e0afffd 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -19,11 +19,14 @@ #include "access/nbtree.h" #include "access/transam.h" #include "access/xloginsert.h" +#include "catalog/index.h" +#include "executor/executor.h" #include "miscadmin.h" +#include "nodes/execnodes.h" #include "storage/lmgr.h" #include "storage/predicate.h" #include "utils/tqual.h" - +#include "utils/datum.h" typedef struct { @@ -249,6 +252,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + Buffer buffer; + HeapTupleData heapTuple; + bool recheck = false; /* Assume unique until we find a duplicate */ *is_unique = true; @@ -308,6 +314,8 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, curitup = (IndexTuple) PageGetItem(page, curitemid); htid = curitup->t_tid; + recheck = false; + /* * If we are doing a recheck, we expect to find the tuple we * are rechecking. It's not a duplicate, but we have to keep @@ -325,112 +333,153 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * have just a single index entry for the entire chain. */ else if (heap_hot_search(&htid, heapRel, &SnapshotDirty, - &all_dead)) + &all_dead, &recheck, &buffer, + &heapTuple)) { TransactionId xwait; + bool result = true; /* - * It is a duplicate. If we are only doing a partial - * check, then don't bother checking if the tuple is being - * updated in another transaction. Just return the fact - * that it is a potential conflict and leave the full - * check till later. + * If the tuple was WARM update, we may again see our own + * tuple. Since WARM updates don't create new index + * entries, our own tuple is only reachable via the old + * index pointer */ - if (checkUnique == UNIQUE_CHECK_PARTIAL) + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - *is_unique = false; - return InvalidTransactionId; + found = true; + result = false; + if (recheck) + UnlockReleaseBuffer(buffer); } - - /* - * If this tuple is being updated by other transaction - * then we have to wait for its commit/abort. - */ - xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? - SnapshotDirty.xmin : SnapshotDirty.xmax; - - if (TransactionIdIsValid(xwait)) + else if (recheck) { - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - /* Tell _bt_doinsert to wait... */ - *speculativeToken = SnapshotDirty.speculativeToken; - return xwait; + result = btrecheck(rel, curitup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); } - /* - * Otherwise we have a definite conflict. But before - * complaining, look to see if the tuple we want to insert - * is itself now committed dead --- if so, don't complain. - * This is a waste of time in normal scenarios but we must - * do it to support CREATE INDEX CONCURRENTLY. - * - * We must follow HOT-chains here because during - * concurrent index build, we insert the root TID though - * the actual tuple may be somewhere in the HOT-chain. - * While following the chain we might not stop at the - * exact tuple which triggered the insert, but that's OK - * because if we find a live tuple anywhere in this chain, - * we have a unique key conflict. The other live tuple is - * not part of this chain because it had a different index - * entry. - */ - htid = itup->t_tid; - if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) - { - /* Normal case --- it's still live */ - } - else + if (result) { /* - * It's been deleted, so no error, and no need to - * continue searching + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. Just return the fact + * that it is a potential conflict and leave the full + * check till later. */ - break; - } + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } - /* - * Check for a conflict-in as we would if we were going to - * write to this page. We aren't actually going to write, - * but we want a chance to report SSI conflicts that would - * otherwise be masked by this unique constraint - * violation. - */ - CheckForSerializableConflictIn(rel, NULL, buf); + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + return xwait; + } - /* - * This is a definite conflict. Break the tuple down into - * datums and report the error. But first, make sure we - * release the buffer locks we're holding --- - * BuildIndexValueDescription could make catalog accesses, - * which in the worst case might touch this same index and - * cause deadlocks. - */ - if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf); - _bt_relbuf(rel, buf); + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + recheck = false; + ItemPointerCopy(&itup->t_tid, &htid); + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL, + &recheck, &buffer, &heapTuple)) + { + bool result = true; + if (recheck) + { + /* + * Recheck if the tuple actually satisfies the + * index key. Otherwise, we might be following + * a wrong index pointer and mustn't entertain + * this tuple + */ + result = btrecheck(rel, itup, heapRel, &heapTuple); + UnlockReleaseBuffer(buffer); + } + if (!result) + break; + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; - - index_deform_tuple(itup, RelationGetDescr(rel), - values, isnull); - - key_desc = BuildIndexValueDescription(rel, values, - isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("duplicate key value violates unique constraint \"%s\"", - RelationGetRelationName(rel)), - key_desc ? errdetail("Key %s already exists.", - key_desc) : 0, - errtableconstraint(heapRel, - RelationGetRelationName(rel)))); + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, buf); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. + */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, buf); + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } } } else if (all_dead) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 128744c..6b1236a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -23,6 +23,7 @@ #include "access/xlog.h" #include "catalog/index.h" #include "commands/vacuum.h" +#include "executor/nodeIndexscan.h" #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" @@ -117,6 +118,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amendscan = btendscan; amroutine->ammarkpos = btmarkpos; amroutine->amrestrpos = btrestrpos; + amroutine->amrecheck = btrecheck; PG_RETURN_POINTER(amroutine); } @@ -292,8 +294,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; - /* btree indexes are never lossy */ - scan->xs_recheck = false; + /* btree indexes are never lossy, except for WARM tuples */ + scan->xs_recheck = indexscan_recheck; + scan->xs_tuple_recheck = indexscan_recheck; /* * If we have any array keys, initialize them during first call for a diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 063c988..c9c0501 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,11 +20,15 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/index.h" +#include "executor/executor.h" #include "miscadmin.h" +#include "nodes/execnodes.h" #include "utils/array.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/datum.h" typedef struct BTSortArrayContext @@ -2065,3 +2069,103 @@ btproperty(Oid index_oid, int attno, return false; /* punt to generic code */ } } + +/* + * Check if the index tuple's key matches the one computed from the given heap + * tuple's attribute + */ +bool +btrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple) +{ + IndexInfo *indexInfo; + EState *estate; + ExprContext *econtext; + TupleTableSlot *slot; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + int i; + bool equal; + int natts = indexRel->rd_rel->relnatts; + Form_pg_attribute att; + + /* Get IndexInfo for this index */ + indexInfo = BuildIndexInfo(indexRel); + + /* + * The heap tuple must be put into a slot for FormIndexDatum. + */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); + + ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); + + /* + * Typically the index won't have expressions, but if it does we need an + * EState to evaluate them. We need it for exclusion constraints too, + * even if they are just on simple columns. + */ + if (indexInfo->ii_Expressions != NIL || + indexInfo->ii_ExclusionOps != NULL) + { + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = slot; + } + else + estate = NULL; + + /* + * Form the index values and isnull flags for the index entry that we need + * to check. + * + * Note: if the index uses functions that are not as immutable as they are + * supposed to be, this could produce an index tuple different from the + * original. The index AM can catch such errors by verifying that it + * finds a matching index entry with the tuple's TID. For exclusion + * constraints we check this in check_exclusion_constraint(). + */ + FormIndexDatum(indexInfo, slot, estate, values, isnull); + + equal = true; + for (i = 1; i <= natts; i++) + { + Datum indxvalue; + bool indxisnull; + + indxvalue = index_getattr(indexTuple, i, indexRel->rd_att, &indxisnull); + + /* + * If both are NULL, then they are equal + */ + if (isnull[i - 1] && indxisnull) + continue; + + /* + * If just one is NULL, then they are not equal + */ + if (isnull[i - 1] || indxisnull) + { + equal = false; + break; + } + + /* + * Now just do a raw memory comparison. If the index tuple was formed + * using this heap tuple, the computed index values must match + */ + att = indexRel->rd_att->attrs[i - 1]; + if (!datumIsEqual(values[i - 1], indxvalue, att->attbyval, + att->attlen)) + { + equal = false; + break; + } + } + + if (estate != NULL) + FreeExecutorState(estate); + + ExecDropSingleTupleTableSlot(slot); + + return equal; +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index d570ae5..813b5c3 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -67,6 +67,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amendscan = spgendscan; amroutine->ammarkpos = NULL; amroutine->amrestrpos = NULL; + amroutine->amrecheck = NULL; PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index b0b43cf..36467b2 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -54,6 +54,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" +#include "optimizer/var.h" #include "parser/parser.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -1674,6 +1675,20 @@ BuildIndexInfo(Relation index) ii->ii_Concurrent = false; ii->ii_BrokenHotChain = false; + /* build a bitmap of all table attributes referred by this index */ + for (i = 0; i < ii->ii_NumIndexAttrs; i++) + { + AttrNumber attr = ii->ii_KeyAttrNumbers[i]; + ii->ii_indxattrs = bms_add_member(ii->ii_indxattrs, attr - + FirstLowInvalidHeapAttributeNumber); + } + + /* Collect all attributes used in expressions, too */ + pull_varattnos((Node *) ii->ii_Expressions, 1, &ii->ii_indxattrs); + + /* Collect all attributes in the index predicate, too */ + pull_varattnos((Node *) ii->ii_Predicate, 1, &ii->ii_indxattrs); + return ii; } diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 26f9114..997c8f5 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -40,6 +40,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) TriggerData *trigdata = (TriggerData *) fcinfo->context; const char *funcname = "unique_key_recheck"; HeapTuple new_row; + HeapTupleData heapTuple; ItemPointerData tmptid; Relation indexRel; IndexInfo *indexInfo; @@ -102,7 +103,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) * removed. */ tmptid = new_row->t_self; - if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL)) + if (!heap_hot_search(&tmptid, trigdata->tg_relation, SnapshotSelf, NULL, + NULL, NULL, &heapTuple)) { /* * All rows in the HOT chain are dead, so skip the check. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 5947e72..75af34c 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2491,6 +2491,7 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, false, NULL, NIL); @@ -2606,6 +2607,7 @@ CopyFromInsertBatch(CopyState cstate, EState *estate, CommandId mycid, ExecStoreTuple(bufferedTuples[i], myslot, InvalidBuffer, false); recheckIndexes = ExecInsertIndexTuples(myslot, &(bufferedTuples[i]->t_self), + &(bufferedTuples[i]->t_self), NULL, estate, false, NULL, NIL); ExecARInsertTriggers(estate, resultRelInfo, bufferedTuples[i], diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 231e92d..ca40e1b 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -1468,6 +1468,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, + NULL, 0, unused, uncnt, vacrelstats->latestRemovedXid); PageSetLSN(page, recptr); @@ -2128,6 +2129,22 @@ heap_page_is_all_visible(Relation rel, Buffer buf, break; } + /* + * If this tuple was ever WARM updated or is a WARM tuple, + * there could be multiple index entries pointing to the + * root of this chain. We can't do index-only scans for + * such tuples without verifying index key check. So mark + * the page as !all_visible + * + * XXX Should we look at the root line pointer and check if + * WARM flag is set there or checking for tuples in the + * chain is good enough? + */ + if (HeapTupleHeaderIsHeapWarmTuple(tuple.t_data)) + { + all_visible = false; + } + /* Track newest xmin on page. */ if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) *visibility_cutoff_xid = xmin; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 0e2d834..da27cf6 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -270,6 +270,8 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) List * ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, + Bitmapset *updated_attrs, EState *estate, bool noDupErr, bool *specConflict, @@ -324,6 +326,17 @@ ExecInsertIndexTuples(TupleTableSlot *slot, if (!indexInfo->ii_ReadyForInserts) continue; + /* + * If updated_attrs is set, we only insert index entries for those + * indexes whose column has changed. All other indexes can use their + * existing index pointers to look up the new tuple + */ + if (updated_attrs) + { + if (!bms_overlap(updated_attrs, indexInfo->ii_indxattrs)) + continue; + } + /* Check for partial index */ if (indexInfo->ii_Predicate != NIL) { @@ -389,7 +402,7 @@ ExecInsertIndexTuples(TupleTableSlot *slot, index_insert(indexRelation, /* index relation */ values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + root_tid, /* tid of heap or root tuple */ heapRelation, /* heap relation */ checkUnique); /* type of uniqueness check to do */ diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 449aacb..ff77349 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -37,6 +37,7 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/valid.h" #include "executor/execdebug.h" #include "executor/nodeBitmapHeapscan.h" #include "pgstat.h" @@ -362,11 +363,23 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) OffsetNumber offnum = tbmres->offsets[curslot]; ItemPointerData tid; HeapTupleData heapTuple; + bool recheck = false; ItemPointerSet(&tid, page, offnum); if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, - &heapTuple, NULL, true)) - scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + &heapTuple, NULL, true, &recheck)) + { + bool valid = true; + + if (scan->rs_key) + HeapKeyTest(&heapTuple, RelationGetDescr(scan->rs_rd), + scan->rs_nkeys, scan->rs_key, valid); + if (valid) + scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + + if (recheck) + tbmres->recheck = true; + } } } else diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 4f6f91c..49bda34 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -141,6 +141,26 @@ IndexOnlyNext(IndexOnlyScanState *node) * but it's not clear whether it's a win to do so. The next index * entry might require a visit to the same heap page. */ + + /* + * If the index was lossy or the tuple was WARM, we have to recheck + * the index quals using the fetched tuple. + */ + if (scandesc->xs_tuple_recheck) + { + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->xs_cbuf, /* buffer containing tuple */ + false); /* don't pfree */ + econtext->ecxt_scantuple = slot; + ResetExprContext(econtext); + if (!ExecQual(node->indexqual, econtext, false)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } + } } /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 3143bd9..0b04bb8 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -39,6 +39,8 @@ #include "utils/memutils.h" #include "utils/rel.h" +bool indexscan_recheck = false; + /* * When an ordering operator is used, tuples fetched from the index that * need to be reordered are queued in a pairing heap, as ReorderTuples. @@ -115,10 +117,10 @@ IndexNext(IndexScanState *node) false); /* don't pfree */ /* - * If the index was lossy, we have to recheck the index quals using - * the fetched tuple. + * If the index was lossy or the tuple was WARM, we have to recheck + * the index quals using the fetched tuple. */ - if (scandesc->xs_recheck) + if (scandesc->xs_tuple_recheck) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index af7b26c..11bd3c0 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -433,6 +433,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, true, &specConflict, arbiterIndexes); @@ -479,6 +480,7 @@ ExecInsert(ModifyTableState *mtstate, /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &(tuple->t_self), NULL, estate, false, NULL, arbiterIndexes); } @@ -809,6 +811,9 @@ ExecUpdate(ItemPointer tupleid, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + Bitmapset *updated_attrs = NULL; + ItemPointerData root_tid; + bool warm_update; /* * abort the operation if not running transactions @@ -923,7 +928,7 @@ lreplace:; estate->es_output_cid, estate->es_crosscheck_snapshot, true /* wait for commit */ , - &hufd, &lockmode); + &hufd, &lockmode, &updated_attrs, &warm_update); switch (result) { case HeapTupleSelfUpdated: @@ -1010,10 +1015,28 @@ lreplace:; * the t_self field. * * If it's a HOT update, we mustn't insert new index entries. + * + * If it's a WARM update, then we must insert new entries with TID + * pointing to the root of the WARM chain. */ - if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) + if (resultRelInfo->ri_NumIndices > 0 && + (!HeapTupleIsHeapOnly(tuple) || warm_update)) + { + if (warm_update) + ItemPointerSet(&root_tid, + ItemPointerGetBlockNumber(&(tuple->t_self)), + HeapTupleHeaderGetRootOffset(tuple->t_data)); + else + { + ItemPointerCopy(&tuple->t_self, &root_tid); + bms_free(updated_attrs); + updated_attrs = NULL; + } recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + &root_tid, + updated_attrs, estate, false, NULL, NIL); + } } if (canSetTag) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 79e0b1f..37874ca 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2030,6 +2030,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) list_free_deep(relation->rd_fkeylist); list_free(relation->rd_indexlist); bms_free(relation->rd_indexattr); + bms_free(relation->rd_exprindexattr); bms_free(relation->rd_keyattr); bms_free(relation->rd_idattr); if (relation->rd_options) @@ -4373,12 +4374,15 @@ Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) { Bitmapset *indexattrs; /* indexed columns */ + Bitmapset *exprindexattrs; /* indexed columns in expression/prediacate + indexes */ Bitmapset *uindexattrs; /* columns in unique indexes */ Bitmapset *idindexattrs; /* columns in the replica identity */ List *indexoidlist; Oid relreplindex; ListCell *l; MemoryContext oldcxt; + bool supportswarm = true;/* True if the table can be WARM updated */ /* Quick exit if we already computed the result. */ if (relation->rd_indexattr != NULL) @@ -4391,6 +4395,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) return bms_copy(relation->rd_keyattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return bms_copy(relation->rd_idattr); + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return bms_copy(relation->rd_exprindexattr); default: elog(ERROR, "unknown attrKind %u", attrKind); } @@ -4429,6 +4435,7 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; + exprindexattrs = NULL; uindexattrs = NULL; idindexattrs = NULL; foreach(l, indexoidlist) @@ -4474,19 +4481,32 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &exprindexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &exprindexattrs); + + /* + * Check if the index has amrecheck method defined. If the method is + * not defined, the index does not support WARM update. Completely + * disable WARM updates on such tables + */ + if (!indexDesc->rd_amroutine->amrecheck) + supportswarm = false; index_close(indexDesc, AccessShareLock); } list_free(indexoidlist); + /* Remember if the table can do WARM updates */ + relation->rd_supportswarm = supportswarm; + /* Don't leak the old values of these bitmaps, if any */ bms_free(relation->rd_indexattr); relation->rd_indexattr = NULL; + bms_free(relation->rd_exprindexattr); + relation->rd_exprindexattr = NULL; bms_free(relation->rd_keyattr); relation->rd_keyattr = NULL; bms_free(relation->rd_idattr); @@ -4502,7 +4522,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) oldcxt = MemoryContextSwitchTo(CacheMemoryContext); relation->rd_keyattr = bms_copy(uindexattrs); relation->rd_idattr = bms_copy(idindexattrs); - relation->rd_indexattr = bms_copy(indexattrs); + relation->rd_exprindexattr = bms_copy(exprindexattrs); + relation->rd_indexattr = bms_copy(bms_union(indexattrs, exprindexattrs)); MemoryContextSwitchTo(oldcxt); /* We return our original working copy for caller to play with */ @@ -4514,6 +4535,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) return uindexattrs; case INDEX_ATTR_BITMAP_IDENTITY_KEY: return idindexattrs; + case INDEX_ATTR_BITMAP_EXPR_PREDICATE: + return exprindexattrs; default: elog(ERROR, "unknown attrKind %u", attrKind); return NULL; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c5178f7..aa7b265 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -111,6 +111,7 @@ extern char *default_tablespace; extern char *temp_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; +extern bool indexscan_recheck; #ifdef TRACE_SYNCSCAN extern bool trace_syncscan; @@ -1271,6 +1272,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"indexscan_recheck", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Recheck heap rows returned from an index scan."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &indexscan_recheck, + false, + NULL, NULL, NULL + }, + { {"debug_deadlocks", PGC_SUSET, DEVELOPER_OPTIONS, gettext_noop("Dumps information about all current locks when a deadlock timeout occurs."), NULL, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 1036cca..37eaf76 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -13,6 +13,7 @@ #define AMAPI_H #include "access/genam.h" +#include "access/itup.h" /* * We don't wish to include planner header files here, since most of an index @@ -137,6 +138,9 @@ typedef void (*ammarkpos_function) (IndexScanDesc scan); /* restore marked scan position */ typedef void (*amrestrpos_function) (IndexScanDesc scan); +/* recheck index tuple and heap tuple match */ +typedef bool (*amrecheck_function) (Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); /* * API struct for an index AM. Note this must be stored in a single palloc'd @@ -196,6 +200,7 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amrecheck_function amrecheck; /* can be NULL */ } IndexAmRoutine; diff --git a/src/include/access/hash.h b/src/include/access/hash.h index d9df904..a25ce5a 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -364,4 +364,8 @@ extern bool _hash_convert_tuple(Relation index, extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); +/* hash.c */ +extern bool hashrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); + #endif /* HASH_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 94b46b8..4c05947 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -137,9 +137,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, Relation stats_relation); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + bool *all_dead, bool first_call, bool *recheck); extern bool heap_hot_search(ItemPointer tid, Relation relation, - Snapshot snapshot, bool *all_dead); + Snapshot snapshot, bool *all_dead, + bool *recheck, Buffer *buffer, HeapTuple heapTuple); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); @@ -160,7 +161,8 @@ extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + HeapUpdateFailureData *hufd, LockTupleMode *lockmode, + Bitmapset **updated_attrs, bool *warm_update); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_update, @@ -186,6 +188,7 @@ extern int heap_page_prune(Relation relation, Buffer buffer, bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, + bool *warmchain, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused); extern void heap_get_root_tuple_one(Page page, OffsetNumber target_offnum, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 5a04561..ddc3a7a 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -80,6 +80,7 @@ #define XLH_UPDATE_CONTAINS_NEW_TUPLE (1<<4) #define XLH_UPDATE_PREFIX_FROM_OLD (1<<5) #define XLH_UPDATE_SUFFIX_FROM_OLD (1<<6) +#define XLH_UPDATE_WARM_UPDATE (1<<7) /* convenience macro for checking whether any form of old tuple was logged */ #define XLH_UPDATE_CONTAINS_OLD \ @@ -211,7 +212,9 @@ typedef struct xl_heap_update * * for each redirected item: the item offset, then the offset redirected to * * for each now-dead item: the item offset * * for each now-unused item: the item offset - * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. + * * for each now-warm item: the item offset + * The total number of OffsetNumbers is therefore + * 2*nredirected+ndead+nunused+nwarm. * Note that nunused is not explicitly stored, but may be found by reference * to the total record length. */ @@ -220,10 +223,11 @@ typedef struct xl_heap_clean TransactionId latestRemovedXid; uint16 nredirected; uint16 ndead; + uint16 nwarm; /* OFFSET NUMBERS are in the block reference 0 */ } xl_heap_clean; -#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) +#define SizeOfHeapClean (offsetof(xl_heap_clean, nwarm) + sizeof(uint16)) /* * Cleanup_info is required in some cases during a lazy VACUUM. @@ -384,6 +388,7 @@ extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, + OffsetNumber *warm, int nwarm, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, TransactionId latestRemovedXid); diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index d01e0d8..3a51681 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -260,7 +260,8 @@ struct HeapTupleHeaderData * information stored in t_infomask2: */ #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ -/* bits 0x0800 are available */ +#define HEAP_WARM_TUPLE 0x0800 /* This tuple is a part of a WARM chain + */ #define HEAP_LATEST_TUPLE 0x1000 /* * This is the last tuple in chain and * ip_posid points to the root line @@ -271,7 +272,7 @@ struct HeapTupleHeaderData #define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ #define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ -#define HEAP2_XACT_MASK 0xF000 /* visibility-related bits */ +#define HEAP2_XACT_MASK 0xF800 /* visibility-related bits */ /* @@ -510,6 +511,21 @@ do { \ (tup)->t_infomask2 & HEAP_ONLY_TUPLE \ ) +#define HeapTupleHeaderSetHeapWarmTuple(tup) \ +do { \ + (tup)->t_infomask2 |= HEAP_WARM_TUPLE; \ +} while (0) + +#define HeapTupleHeaderClearHeapWarmTuple(tup) \ +do { \ + (tup)->t_infomask2 &= ~HEAP_WARM_TUPLE; \ +} while (0) + +#define HeapTupleHeaderIsHeapWarmTuple(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_WARM_TUPLE) \ +) + #define HeapTupleHeaderSetHeapLatest(tup) \ ( \ (tup)->t_infomask2 |= HEAP_LATEST_TUPLE \ @@ -771,6 +787,15 @@ struct MinimalTupleData #define HeapTupleClearHeapOnly(tuple) \ HeapTupleHeaderClearHeapOnly((tuple)->t_data) +#define HeapTupleIsHeapWarmTuple(tuple) \ + HeapTupleHeaderIsHeapWarmTuple((tuple)->t_data) + +#define HeapTupleSetHeapWarmTuple(tuple) \ + HeapTupleHeaderSetHeapWarmTuple((tuple)->t_data) + +#define HeapTupleClearHeapWarmTuple(tuple) \ + HeapTupleHeaderClearHeapWarmTuple((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index c580f51..83af072 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -751,6 +751,8 @@ extern bytea *btoptions(Datum reloptions, bool validate); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); +extern bool btrecheck(Relation indexRel, IndexTuple indexTuple, + Relation heapRel, HeapTuple heapTuple); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 49c2a6f..880e62e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -110,7 +110,8 @@ typedef struct IndexScanDescData HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ - bool xs_recheck; /* T means scan keys must be rechecked */ + bool xs_recheck; /* T means scan keys must be rechecked for each tuple */ + bool xs_tuple_recheck; /* T means scan keys must be rechecked for current tuple */ /* * When fetching with an ordering operator, the values of the ORDER BY diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 39521ed..60a5445 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -366,6 +366,7 @@ extern void UnregisterExprContextCallback(ExprContext *econtext, extern void ExecOpenIndices(ResultRelInfo *resultRelInfo, bool speculative); extern void ExecCloseIndices(ResultRelInfo *resultRelInfo); extern List *ExecInsertIndexTuples(TupleTableSlot *slot, ItemPointer tupleid, + ItemPointer root_tid, Bitmapset *updated_attrs, EState *estate, bool noDupErr, bool *specConflict, List *arbiterIndexes); extern bool ExecCheckIndexConstraints(TupleTableSlot *slot, EState *estate, diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h index 194fadb..fe9c78e 100644 --- a/src/include/executor/nodeIndexscan.h +++ b/src/include/executor/nodeIndexscan.h @@ -38,4 +38,5 @@ extern bool ExecIndexEvalArrayKeys(ExprContext *econtext, IndexArrayKeyInfo *arrayKeys, int numArrayKeys); extern bool ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys); +extern bool indexscan_recheck; #endif /* NODEINDEXSCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index a4ea1b9..42f8ecf 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -60,6 +60,7 @@ typedef struct IndexInfo NodeTag type; int ii_NumIndexAttrs; AttrNumber ii_KeyAttrNumbers[INDEX_MAX_KEYS]; + Bitmapset *ii_indxattrs; /* bitmap of all columns used in this index */ List *ii_Expressions; /* list of Expr */ List *ii_ExpressionsState; /* list of ExprState */ List *ii_Predicate; /* list of Expr */ diff --git a/src/include/storage/itemid.h b/src/include/storage/itemid.h index 509c577..8c9cc99 100644 --- a/src/include/storage/itemid.h +++ b/src/include/storage/itemid.h @@ -46,6 +46,12 @@ typedef ItemIdData *ItemId; typedef uint16 ItemOffset; typedef uint16 ItemLength; +/* + * Special value used in lp_len to indicate that the chain starting at line + * pointer may contain WARM tuples. This must only be interpreted along with + * LP_REDIRECT flag + */ +#define SpecHeapWarmLen 0x1ffb /* ---------------- * support macros @@ -112,12 +118,15 @@ typedef uint16 ItemLength; #define ItemIdIsDead(itemId) \ ((itemId)->lp_flags == LP_DEAD) +#define ItemIdIsHeapWarm(itemId) \ + (((itemId)->lp_flags == LP_REDIRECT) && \ + ((itemId)->lp_len == SpecHeapWarmLen)) /* * ItemIdHasStorage * True iff item identifier has associated storage. */ #define ItemIdHasStorage(itemId) \ - ((itemId)->lp_len != 0) + (!ItemIdIsRedirected(itemId) && (itemId)->lp_len != 0) /* * ItemIdSetUnused @@ -168,6 +177,26 @@ typedef uint16 ItemLength; ) /* + * ItemIdSetHeapWarm + * Set the item identifier to identify as starting of a WARM chain + * + * Note: Since all bits in lp_flags are currently used, we store a special + * value in lp_len field to indicate this state. This is required only for + * LP_REDIRECT tuple and lp_len field is unused for such line pointers. + */ +#define ItemIdSetHeapWarm(itemId) \ +do { \ + AssertMacro((itemId)->lp_flags == LP_REDIRECT); \ + (itemId)->lp_len = SpecHeapWarmLen; \ +} while (0) + +#define ItemIdClearHeapWarm(itemId) \ +( \ + AssertMacro((itemId)->lp_flags == LP_REDIRECT); \ + (itemId)->lp_len = 0; \ +) + +/* * ItemIdMarkDead * Set the item identifier to be DEAD, keeping its existing storage. * diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index ed14442..dac32b5 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -101,8 +101,11 @@ typedef struct RelationData /* data managed by RelationGetIndexAttrBitmap: */ Bitmapset *rd_indexattr; /* identifies columns used in indexes */ + Bitmapset *rd_exprindexattr; /* indentified columns used in expression or + predicate indexes */ Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ Bitmapset *rd_idattr; /* included in replica identity index */ + bool rd_supportswarm;/* True if the table can be WARM updated */ /* * rd_options is set whenever rd_rel is loaded into the relcache entry. diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 6ea7dd2..290e9b7 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -48,7 +48,8 @@ typedef enum IndexAttrBitmapKind { INDEX_ATTR_BITMAP_ALL, INDEX_ATTR_BITMAP_KEY, - INDEX_ATTR_BITMAP_IDENTITY_KEY + INDEX_ATTR_BITMAP_IDENTITY_KEY, + INDEX_ATTR_BITMAP_EXPR_PREDICATE } IndexAttrBitmapKind; extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation,