diff --git a/doc/src/sgml/mvcc.sgml b/doc/src/sgml/mvcc.sgml
index f8c9655111..ab73463b53 100644
--- a/doc/src/sgml/mvcc.sgml
+++ b/doc/src/sgml/mvcc.sgml
@@ -238,7 +238,7 @@
Not possible
- Allowed, but not in PG
+ Allowed, but not generally useful
Possible
@@ -268,11 +268,10 @@
In PostgreSQL, you can request any of
- the four standard transaction isolation levels, but internally only
- three distinct isolation levels are implemented, i.e. PostgreSQL's
- Read Uncommitted mode behaves like Read Committed. This is because
- it is the only sensible way to map the standard isolation levels to
- PostgreSQL's multiversion concurrency control architecture.
+ the four standard transaction isolation levels, but Read Uncommitted
+ is rarely needed: because readers are not blocked by writers under
+ PostgreSQL's multiversion concurrency control architecture, there are
+ few use cases where this isolation level is useful to applications.
@@ -784,6 +783,54 @@ ERROR: could not serialize access due to read/write dependencies among transact
+
+
+ Read Uncommitted Isolation Level
+
+
+ transaction isolation level
+ read uncommitted
+
+
+
+ read uncommitted
+
+
+
+ In PostgreSQL's MVCC
+ architecture, readers are not blocked by writers, so in general
+ you should have no need for this transaction isolation level.
+
+
+
+ In general, read uncommitted will return inconsistent results and
+ wrong answers. If you look at the changes made by a transaction
+ while it continues to make changes, then you may get partial results
+ from queries, or you may miss index entries that haven't yet been
+ written. However, if you are reading data written by transactions
+ that are paused at the end of their execution, for whatever reason,
+ then you can see a consistent result.
+
+
+
+ The main use case for this transaction isolation level is
+ investigating or recovering data. Examples include inspecting the
+ writes made by a locked or hanging transaction; running queries on
+ a standby node that is currently paused, such as when the standby
+ has halted at a recovery target with
+ recovery_target_inclusive = false; or
+ inspecting changes made by an in-doubt prepared transaction to
+ decide whether to commit or abort that transaction.
+
+
+
+ In PostgreSQL, read uncommitted mode gives
+ a consistent snapshot that includes the transactions running at the
+ time the snapshot was taken. Transactions starting after that time
+ will not be visible, even while they are still uncommitted.
+
+
+
diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c
index 3e3646716f..d669a83418 100644
--- a/src/backend/access/heap/heapam_visibility.c
+++ b/src/backend/access/heap/heapam_visibility.c
@@ -1058,7 +1058,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
HeapTupleHeaderGetRawXmin(tuple));
- else
+ else if (XactIsoLevel != XACT_READ_UNCOMMITTED)
{
/* it must have aborted or crashed */
SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
@@ -1103,6 +1103,8 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
}
if (XidInMVCCSnapshot(xmax, snapshot))
return true;
+ if (XactIsoLevel == XACT_READ_UNCOMMITTED)
+ return false;
if (TransactionIdDidCommit(xmax))
return false; /* updating transaction committed */
/* it must have aborted or crashed */
@@ -1122,6 +1124,9 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot))
return true;
+ if (XactIsoLevel == XACT_READ_UNCOMMITTED)
+ return false;
+
if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
{
/* it must have aborted or crashed */
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 13bcbe77de..2335be5306 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1743,10 +1743,45 @@ GetSnapshotData(Snapshot snapshot)
RecentXmin = xmin;
snapshot->xmin = xmin;
- snapshot->xmax = xmax;
- snapshot->xcnt = count;
- snapshot->subxcnt = subcount;
- snapshot->suboverflowed = suboverflowed;
+ if (XactIsoLevel == XACT_READ_UNCOMMITTED)
+ {
+ /*
+ * In XACT_READ_UNCOMMITTED we want a consistent snapshot, just
+ * one that can also see data written by transactions currently
+ * in progress. Any transactions started AFTER this point will
+ * still be invisible to us. We don't use the normal xmax (latest
+ * completed xid + 1) because that misses many currently executing
+ * xids. This is safe since we read the value atomically, so we
+ * don't need XidGenLock.
+ *
+ * This is a useful definition of a consistent snapshot when
+ * we want to see the effects of unresolved 2PC transactions
+ * or when recovery has paused. In other cases, transactions
+ * might continue to write, so the results might still be
+ * inconsistent; caveat emptor.
+ */
+ snapshot->xmax = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
+
+ /*
+ * We still need to calculate xmin correctly, so we respect the
+ * normal limits for cleaning up as we scan. This is needed in
+ * recovery in case we want to keep using this snapshot after
+ * the standby is promoted.
+ *
+ * Other values must be zeroed otherwise the snapshot wouldn't
+ * be able to see the uncommitted transactions.
+ */
+ snapshot->xcnt = 0;
+ snapshot->subxcnt = 0;
+ snapshot->suboverflowed = false;
+ }
+ else
+ {
+ snapshot->xmax = xmax;
+ snapshot->xcnt = count;
+ snapshot->subxcnt = subcount;
+ snapshot->suboverflowed = suboverflowed;
+ }
snapshot->curcid = GetCurrentCommandId(false);