diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c index fb0013d..b2253b1 100644 --- a/src/backend/executor/execScan.c +++ b/src/backend/executor/execScan.c @@ -120,7 +120,8 @@ ExecScanFetch(ScanState *node, TupleTableSlot * ExecScan(ScanState *node, ExecScanAccessMtd accessMtd, /* function returning a tuple */ - ExecScanRecheckMtd recheckMtd) + ExecScanRecheckMtd recheckMtd, + ExecScanValidateMtd validateMtd) { ExprContext *econtext; List *qual; @@ -139,7 +140,7 @@ ExecScan(ScanState *node, * If we have neither a qual to check nor a projection to do, just skip * all the overhead and return the raw scan tuple. */ - if (!qual && !projInfo) + if (!qual && !projInfo && !validateMtd) { ResetExprContext(econtext); return ExecScanFetch(node, accessMtd, recheckMtd); @@ -205,8 +206,10 @@ ExecScan(ScanState *node, * when the qual is nil ... saves only a few cycles, but they add up * ... */ - if (!qual || ExecQual(qual, econtext, false)) + if ((!qual || ExecQual(qual, econtext, false)) + && (!validateMtd || (*validateMtd)(node, slot))) { + /* * Found a satisfactory scan tuple. 
*/ diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 449aacb..56e10af 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -439,7 +439,8 @@ ExecBitmapHeapScan(BitmapHeapScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) BitmapHeapNext, - (ExecScanRecheckMtd) BitmapHeapRecheck); + (ExecScanRecheckMtd) BitmapHeapRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeCtescan.c b/src/backend/executor/nodeCtescan.c index 162650a..f0cd837 100644 --- a/src/backend/executor/nodeCtescan.c +++ b/src/backend/executor/nodeCtescan.c @@ -154,7 +154,8 @@ ExecCteScan(CteScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) CteScanNext, - (ExecScanRecheckMtd) CteScanRecheck); + (ExecScanRecheckMtd) CteScanRecheck, + NULL); } diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index d886aaf..f0a4836 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -118,7 +118,8 @@ ExecForeignScan(ForeignScanState *node) { return ExecScan((ScanState *) node, (ExecScanAccessMtd) ForeignNext, - (ExecScanRecheckMtd) ForeignRecheck); + (ExecScanRecheckMtd) ForeignRecheck, + NULL); } diff --git a/src/backend/executor/nodeFunctionscan.c b/src/backend/executor/nodeFunctionscan.c index 5a0f324..66dce21 100644 --- a/src/backend/executor/nodeFunctionscan.c +++ b/src/backend/executor/nodeFunctionscan.c @@ -267,7 +267,8 @@ ExecFunctionScan(FunctionScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) FunctionNext, - (ExecScanRecheckMtd) FunctionRecheck); + (ExecScanRecheckMtd) FunctionRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 4f6f91c..ee87c64 100644 --- 
a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -38,6 +38,7 @@
 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
 				TupleDesc itupdesc);
+static bool IndexOnlyCheckVisibility(IndexOnlyScanState *node, TupleTableSlot *slot);
 
 
 /* ----------------------------------------------------------------
@@ -78,71 +79,6 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	 */
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
-		HeapTuple tuple = NULL;
-
-		/*
-		 * We can skip the heap fetch if the TID references a heap page on
-		 * which all tuples are known visible to everybody. In any case,
-		 * we'll use the index tuple not the heap tuple as the data source.
-		 *
-		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
-		 * lock the visibility map buffer, and therefore the result we read
-		 * here could be slightly stale. However, it can't be stale enough to
-		 * matter.
-		 *
-		 * We need to detect clearing a VM bit due to an insert right away,
-		 * because the tuple is present in the index page but not visible. The
-		 * reading of the TID by this scan (using a shared lock on the index
-		 * buffer) is serialized with the insert of the TID into the index
-		 * (using an exclusive lock on the index buffer). Because the VM bit
-		 * is cleared before updating the index, and locking/unlocking of the
-		 * index page acts as a full memory barrier, we are sure to see the
-		 * cleared bit if we see a recently-inserted TID.
-		 *
-		 * Deletes do not update the index page (only VACUUM will clear out
-		 * the TID), so the clearing of the VM bit by a delete is not
-		 * serialized with this test below, and we may see a value that is
-		 * significantly stale. However, we don't care about the delete right
-		 * away, because the tuple is still visible until the deleting
-		 * transaction commits or the statement ends (if it's our
-		 * transaction). In either case, the lock on the VM buffer will have
-		 * been released (acting as a write barrier) after clearing the bit.
-		 * And for us to have a snapshot that includes the deleting
-		 * transaction (making the tuple invisible), we must have acquired
-		 * ProcArrayLock after that time, acting as a read barrier.
-		 *
-		 * It's worth going through this complexity to avoid needing to lock
-		 * the VM buffer, which could cause significant contention.
-		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			node->ioss_HeapFetches++;
-			tuple = index_fetch_heap(scandesc);
-			if (tuple == NULL)
-				continue;		/* no visible tuple, try next index entry */
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found. If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_continue_hot)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
-
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf. We could release that pin now,
-			 * but it's not clear whether it's a win to do so. The next index
-			 * entry might require a visit to the same heap page.
-			 */
-		}
-
 		/*
 		 * Fill the scan tuple slot with data from the index.
 		 */
@@ -181,14 +117,11 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		/*
 		 * Predicate locks for index-only scans must be acquired at the page
 		 * level when the heap is not accessed, since tuple-level predicate
-		 * locks need the tuple's xmin value. If we had to visit the tuple
-		 * anyway, then we already have the tuple-level lock and can skip the
-		 * page lock.
+		 * locks need the tuple's xmin value.
 		 */
-		if (tuple == NULL)
-			PredicateLockPage(scandesc->heapRelation,
-							  ItemPointerGetBlockNumber(tid),
-							  estate->es_snapshot);
+		PredicateLockPage(scandesc->heapRelation,
+						  ItemPointerGetBlockNumber(tid),
+						  estate->es_snapshot);
 
 		return slot;
 	}
@@ -200,6 +133,79 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	return ExecClearTuple(slot);
 }
 
+/*
+ * IndexOnlyCheckVisibility
+ *		Return false if the scan's current TID has no visible heap tuple.
+ */
+static bool IndexOnlyCheckVisibility(IndexOnlyScanState *node, TupleTableSlot *slot)
+{
+	IndexScanDesc scandesc = node->ioss_ScanDesc;
+	ItemPointer tid = &(scandesc->xs_ctup.t_self);
+
+	/*
+	 * We can skip the heap fetch if the TID references a heap page on
+	 * which all tuples are known visible to everybody. In any case,
+	 * we'll use the index tuple not the heap tuple as the data source.
+	 *
+	 * Note on Memory Ordering Effects: visibilitymap_get_status does not
+	 * lock the visibility map buffer, and therefore the result we read
+	 * here could be slightly stale. However, it can't be stale enough to
+	 * matter.
+	 *
+	 * We need to detect clearing a VM bit due to an insert right away,
+	 * because the tuple is present in the index page but not visible. The
+	 * reading of the TID by this scan (using a shared lock on the index
+	 * buffer) is serialized with the insert of the TID into the index
+	 * (using an exclusive lock on the index buffer). Because the VM bit
+	 * is cleared before updating the index, and locking/unlocking of the
+	 * index page acts as a full memory barrier, we are sure to see the
+	 * cleared bit if we see a recently-inserted TID.
+	 *
+	 * Deletes do not update the index page (only VACUUM will clear out
+	 * the TID), so the clearing of the VM bit by a delete is not
+	 * serialized with this test below, and we may see a value that is
+	 * significantly stale. However, we don't care about the delete right
+	 * away, because the tuple is still visible until the deleting
+	 * transaction commits or the statement ends (if it's our
+	 * transaction). In either case, the lock on the VM buffer will have
+	 * been released (acting as a write barrier) after clearing the bit.
+	 * And for us to have a snapshot that includes the deleting
+	 * transaction (making the tuple invisible), we must have acquired
+	 * ProcArrayLock after that time, acting as a read barrier.
+	 *
+	 * It's worth going through this complexity to avoid needing to lock
+	 * the VM buffer, which could cause significant contention.
+	 */
+	if (!VM_ALL_VISIBLE(scandesc->heapRelation,
+						ItemPointerGetBlockNumber(tid),
+						&node->ioss_VMBuffer))
+	{
+		/*
+		 * Rats, we have to visit the heap to check visibility.
+		 */
+		node->ioss_HeapFetches++;
+		if (index_fetch_heap(scandesc) == NULL)
+			return false;		/* no visible tuple; reject this index entry */
+
+		/*
+		 * Only MVCC snapshots are supported here, so there should be no
+		 * need to keep following the HOT chain once a visible entry has
+		 * been found. If we did want to allow that, we'd need to keep
+		 * more state to remember not to call index_getnext_tid next time.
+		 */
+		if (scandesc->xs_continue_hot)
+			elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+		/*
+		 * Note: at this point we are holding a pin on the heap page, as
+		 * recorded in scandesc->xs_cbuf. We could release that pin now,
+		 * but it's not clear whether it's a win to do so. The next index
+		 * entry might require a visit to the same heap page.
+		 */
+	}
+	return true;
+}
+
 /*
  * StoreIndexTuple
  *		Fill the slot with data from the index tuple.
@@ -260,7 +266,8 @@ ExecIndexOnlyScan(IndexOnlyScanState *node) return ExecScan(&node->ss, (ExecScanAccessMtd) IndexOnlyNext, - (ExecScanRecheckMtd) IndexOnlyRecheck); + (ExecScanRecheckMtd) IndexOnlyRecheck, + (ExecScanValidateMtd) IndexOnlyCheckVisibility); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 3143bd9..d15946e 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -494,11 +494,13 @@ ExecIndexScan(IndexScanState *node) if (node->iss_NumOrderByKeys > 0) return ExecScan(&node->ss, (ExecScanAccessMtd) IndexNextWithReorder, - (ExecScanRecheckMtd) IndexRecheck); + (ExecScanRecheckMtd) IndexRecheck, + NULL); else return ExecScan(&node->ss, (ExecScanAccessMtd) IndexNext, - (ExecScanRecheckMtd) IndexRecheck); + (ExecScanRecheckMtd) IndexRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 9ce7c02..f89d1d4 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -100,7 +100,8 @@ ExecSampleScan(SampleScanState *node) { return ExecScan((ScanState *) node, (ExecScanAccessMtd) SampleNext, - (ExecScanRecheckMtd) SampleRecheck); + (ExecScanRecheckMtd) SampleRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 00bf3a5..716b85c 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -126,7 +126,8 @@ ExecSeqScan(SeqScanState *node) { return ExecScan((ScanState *) node, (ExecScanAccessMtd) SeqNext, - (ExecScanRecheckMtd) SeqRecheck); + (ExecScanRecheckMtd) SeqRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git 
a/src/backend/executor/nodeSubqueryscan.c b/src/backend/executor/nodeSubqueryscan.c index 9bafc62..a684042 100644 --- a/src/backend/executor/nodeSubqueryscan.c +++ b/src/backend/executor/nodeSubqueryscan.c @@ -84,7 +84,8 @@ ExecSubqueryScan(SubqueryScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) SubqueryNext, - (ExecScanRecheckMtd) SubqueryRecheck); + (ExecScanRecheckMtd) SubqueryRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index d54fe36..bdb83ee 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -392,7 +392,8 @@ ExecTidScan(TidScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) TidNext, - (ExecScanRecheckMtd) TidRecheck); + (ExecScanRecheckMtd) TidRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeValuesscan.c b/src/backend/executor/nodeValuesscan.c index 9c03f8a..fbaf2c2 100644 --- a/src/backend/executor/nodeValuesscan.c +++ b/src/backend/executor/nodeValuesscan.c @@ -191,7 +191,8 @@ ExecValuesScan(ValuesScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) ValuesNext, - (ExecScanRecheckMtd) ValuesRecheck); + (ExecScanRecheckMtd) ValuesRecheck, + NULL); } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeWorktablescan.c b/src/backend/executor/nodeWorktablescan.c index cfed6e6..1f4364d 100644 --- a/src/backend/executor/nodeWorktablescan.c +++ b/src/backend/executor/nodeWorktablescan.c @@ -116,7 +116,8 @@ ExecWorkTableScan(WorkTableScanState *node) return ExecScan(&node->ss, (ExecScanAccessMtd) WorkTableScanNext, - (ExecScanRecheckMtd) WorkTableScanRecheck); + (ExecScanRecheckMtd) WorkTableScanRecheck, + NULL); } diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 136276b..6a548c4 100644 --- 
a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -254,9 +254,10 @@ extern TupleTableSlot *ExecProject(ProjectionInfo *projInfo, */ typedef TupleTableSlot *(*ExecScanAccessMtd) (ScanState *node); typedef bool (*ExecScanRecheckMtd) (ScanState *node, TupleTableSlot *slot); +typedef bool (*ExecScanValidateMtd) (ScanState *node, TupleTableSlot *slot); extern TupleTableSlot *ExecScan(ScanState *node, ExecScanAccessMtd accessMtd, - ExecScanRecheckMtd recheckMtd); + ExecScanRecheckMtd recheckMtd, ExecScanValidateMtd validateMtd); extern void ExecAssignScanProjectionInfo(ScanState *node); extern void ExecAssignScanProjectionInfoWithVarno(ScanState *node, Index varno); extern void ExecScanReScan(ScanState *node);