diff --git a/contrib/pgrowlocks/Makefile b/contrib/pgrowlocks/Makefile index f56389b..fe80423 100644 --- a/contrib/pgrowlocks/Makefile +++ b/contrib/pgrowlocks/Makefile @@ -4,7 +4,7 @@ MODULE_big = pgrowlocks OBJS = pgrowlocks.o EXTENSION = pgrowlocks -DATA = pgrowlocks--1.0.sql pgrowlocks--unpackaged--1.0.sql +DATA = pgrowlocks--1.1.sql pgrowlocks--1.0--1.1.sql pgrowlocks--unpackaged--1.0.sql ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql new file mode 100644 index 0000000..70f20c7 --- /dev/null +++ b/contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql @@ -0,0 +1,18 @@ +/* contrib/pgrowlocks/pgrowlocks--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pgrowlocks UPDATE TO '1.1'" to load this file. \quit + +ALTER EXTENSION pgrowlocks DROP FUNCTION pgrowlocks(text); +DROP FUNCTION pgrowlocks(text); +CREATE FUNCTION pgrowlocks(IN relname text, + OUT locked_row TID, -- row TID + OUT lock_type TEXT, -- lock type + OUT locker XID, -- locking XID + OUT multi bool, -- multi XID? + OUT xids xid[], -- multi XIDs + OUT modes text[], -- multi XID statuses + OUT pids INTEGER[]) -- locker's process id +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pgrowlocks' +LANGUAGE C STRICT; diff --git a/contrib/pgrowlocks/pgrowlocks--1.0.sql b/contrib/pgrowlocks/pgrowlocks--1.0.sql deleted file mode 100644 index a909b74..0000000 --- a/contrib/pgrowlocks/pgrowlocks--1.0.sql +++ /dev/null @@ -1,15 +0,0 @@ -/* contrib/pgrowlocks/pgrowlocks--1.0.sql */ - --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit - -CREATE FUNCTION pgrowlocks(IN relname text, - OUT locked_row TID, -- row TID - OUT lock_type TEXT, -- lock type - OUT locker XID, -- locking XID - OUT multi bool, -- multi XID? 
- OUT xids xid[], -- multi XIDs - OUT pids INTEGER[]) -- locker's process id -RETURNS SETOF record -AS 'MODULE_PATHNAME', 'pgrowlocks' -LANGUAGE C STRICT; diff --git a/contrib/pgrowlocks/pgrowlocks--1.1.sql b/contrib/pgrowlocks/pgrowlocks--1.1.sql new file mode 100644 index 0000000..924d80f --- /dev/null +++ b/contrib/pgrowlocks/pgrowlocks--1.1.sql @@ -0,0 +1,16 @@ +/* contrib/pgrowlocks/pgrowlocks--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pgrowlocks" to load this file. \quit + +CREATE FUNCTION pgrowlocks(IN relname text, + OUT locked_row TID, -- row TID + OUT lock_type TEXT, -- lock type + OUT locker XID, -- locking XID + OUT multi bool, -- multi XID? + OUT xids xid[], -- multi XIDs + OUT modes text[], -- multi XID statuses + OUT pids INTEGER[]) -- locker's process id +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pgrowlocks' +LANGUAGE C STRICT; diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c index 20beed2..170547a 100644 --- a/contrib/pgrowlocks/pgrowlocks.c +++ b/contrib/pgrowlocks/pgrowlocks.c @@ -59,6 +59,14 @@ typedef struct int ncolumns; } MyData; +#define Atnum_tid 0 +#define Atnum_type 1 +#define Atnum_xmax 2 +#define Atnum_ismulti 3 +#define Atnum_xids 4 +#define Atnum_modes 5 +#define Atnum_pids 6 + Datum pgrowlocks(PG_FUNCTION_ARGS) { @@ -124,72 +132,96 @@ pgrowlocks(PG_FUNCTION_ARGS) GetCurrentCommandId(false), scan->rs_cbuf) == HeapTupleBeingUpdated) { - char **values; - int i; values = (char **) palloc(mydata->ncolumns * sizeof(char *)); - i = 0; - values[i++] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self)); + values[Atnum_tid] = (char *) DirectFunctionCall1(tidout, PointerGetDatum(&tuple->t_self)); - if (tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK) - values[i++] = pstrdup("Shared"); - else - values[i++] = pstrdup("Exclusive"); - values[i] = palloc(NCHARS * sizeof(char)); - snprintf(values[i++], NCHARS, "%d", 
HeapTupleHeaderGetXmax(tuple->t_data)); + values[Atnum_type] = palloc(36); + values[Atnum_type][0] = '\0'; + if (tuple->t_data->t_infomask & HEAP_XMAX_KEYSHR_LOCK) + strcat(values[Atnum_type], "KeyShare "); + if (tuple->t_data->t_infomask & HEAP_XMAX_EXCL_LOCK) + strcat(values[Atnum_type], "Exclusive "); + if (tuple->t_data->t_infomask & HEAP_XMAX_IS_NOT_UPDATE) + strcat(values[Atnum_type], "IsNotUpdate "); + + values[Atnum_xmax] = palloc(NCHARS * sizeof(char)); + snprintf(values[Atnum_xmax], NCHARS, "%d", HeapTupleHeaderGetXmax(tuple->t_data)); if (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) { - TransactionId *xids; - int nxids; + MultiXactMember *members; + int nmembers; int j; - int isValidXid = 0; /* any valid xid ever exists? */ + bool isValidXid = false; /* any valid xid ever exists? */ - values[i++] = pstrdup("true"); - nxids = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple->t_data), &xids); - if (nxids == -1) - { + values[Atnum_ismulti] = pstrdup("true"); + + nmembers = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple->t_data), &members); + if (nmembers == -1) elog(ERROR, "GetMultiXactIdMembers returns error"); - } - values[i] = palloc(NCHARS * nxids); - values[i + 1] = palloc(NCHARS * nxids); - strcpy(values[i], "{"); - strcpy(values[i + 1], "{"); + values[Atnum_xids] = palloc(NCHARS * nmembers); + values[Atnum_modes] = palloc(NCHARS * nmembers); + values[Atnum_pids] = palloc(NCHARS * nmembers); - for (j = 0; j < nxids; j++) + strcpy(values[Atnum_xids], "{"); + strcpy(values[Atnum_modes], "{"); + strcpy(values[Atnum_pids], "{"); + + for (j = 0; j < nmembers; j++) { char buf[NCHARS]; - if (TransactionIdIsInProgress(xids[j])) + if (isValidXid) { - if (isValidXid) - { - strcat(values[i], ","); - strcat(values[i + 1], ","); - } - snprintf(buf, NCHARS, "%d", xids[j]); - strcat(values[i], buf); - snprintf(buf, NCHARS, "%d", BackendXidGetPid(xids[j])); - strcat(values[i + 1], buf); - - isValidXid = 1; + strcat(values[Atnum_xids], ","); + 
strcat(values[Atnum_modes], ","); + strcat(values[Atnum_pids], ","); + } + snprintf(buf, NCHARS, "%d", members[j].xid); + strcat(values[Atnum_xids], buf); + switch (members[j].status) + { + case MultiXactStatusKeyUpdate: + snprintf(buf, NCHARS, "keyupd"); + break; + case MultiXactStatusUpdate: + snprintf(buf, NCHARS, "upd"); + break; + case MultiXactStatusForUpdate: + snprintf(buf, NCHARS, "forupd"); + break; + case MultiXactStatusForShare: + snprintf(buf, NCHARS, "shr"); + break; + case MultiXactStatusForKeyShare: + snprintf(buf, NCHARS, "keyshr"); + break; } + strcat(values[Atnum_modes], buf); + snprintf(buf, NCHARS, "%d", BackendXidGetPid(members[j].xid)); + strcat(values[Atnum_pids], buf); + + isValidXid = true; } - strcat(values[i], "}"); - strcat(values[i + 1], "}"); - i++; + strcat(values[Atnum_xids], "}"); + strcat(values[Atnum_modes], "}"); + strcat(values[Atnum_pids], "}"); } else { - values[i++] = pstrdup("false"); - values[i] = palloc(NCHARS * sizeof(char)); - snprintf(values[i++], NCHARS, "{%d}", HeapTupleHeaderGetXmax(tuple->t_data)); + values[Atnum_ismulti] = pstrdup("false"); + + values[Atnum_xids] = palloc(NCHARS * sizeof(char)); + snprintf(values[Atnum_xids], NCHARS, "{%d}", HeapTupleHeaderGetXmax(tuple->t_data)); + + values[Atnum_modes] = NULL; - values[i] = palloc(NCHARS * sizeof(char)); - snprintf(values[i++], NCHARS, "{%d}", BackendXidGetPid(HeapTupleHeaderGetXmax(tuple->t_data))); + values[Atnum_pids] = palloc(NCHARS * sizeof(char)); + snprintf(values[Atnum_pids], NCHARS, "{%d}", BackendXidGetPid(HeapTupleHeaderGetXmax(tuple->t_data))); } LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); @@ -200,10 +232,10 @@ pgrowlocks(PG_FUNCTION_ARGS) /* make the tuple into a datum */ result = HeapTupleGetDatum(tuple); - /* Clean up */ - for (i = 0; i < mydata->ncolumns; i++) - pfree(values[i]); - pfree(values); + /* + * no need to pfree what we allocated; it's on a short-lived memory + * context anyway + */ SRF_RETURN_NEXT(funcctx, result); } diff --git 
a/contrib/pgrowlocks/pgrowlocks.control b/contrib/pgrowlocks/pgrowlocks.control index a6ba164..dfa587d 100644 --- a/contrib/pgrowlocks/pgrowlocks.control +++ b/contrib/pgrowlocks/pgrowlocks.control @@ -1,5 +1,5 @@ # pgrowlocks extension comment = 'show row-level locking information' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/pgrowlocks' relocatable = true diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b2d1901..42d14a2 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -74,6 +74,7 @@ bool synchronize_seqscans = true; +static LOCKMODE get_lockmode_for_tuplelock(LockTupleMode mode); static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, @@ -84,6 +85,7 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, bool all_visible_cleared, bool new_all_visible_cleared); static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, HeapTuple oldtup, HeapTuple newtup); +static uint16 GetMultiXactIdHintBits(MultiXactId multi); /* ---------------------------------------------------------------- @@ -1620,7 +1622,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ItemPointerGetBlockNumber(tid)); offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); at_chain_start = false; - prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); } else break; /* end of chain */ @@ -1743,7 +1745,7 @@ heap_get_latest_tid(Relation relation, * tuple. Check for XMIN match. */ if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) { UnlockReleaseBuffer(buffer); break; @@ -1761,7 +1763,8 @@ heap_get_latest_tid(Relation relation, /* * If there's a valid t_ctid link, follow it, else we're done. 
*/ - if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) || + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsLocked(tp.t_data) || ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) { UnlockReleaseBuffer(buffer); @@ -1769,7 +1772,7 @@ heap_get_latest_tid(Relation relation, } ctid = tp.t_data->t_ctid; - priorXmax = HeapTupleHeaderGetXmax(tp.t_data); + priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); UnlockReleaseBuffer(buffer); } /* end of loop */ } @@ -2085,10 +2088,11 @@ simple_heap_insert(Relation relation, HeapTuple tup) * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated * (the last only possible if wait == false). * - * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. + * In the failure cases, the routine returns the tuple's t_ctid and the + * updating Xid (resolving a possible MultiXact, if necessary). * If t_ctid is the same as tid, the tuple was deleted; if different, the * tuple was updated, and t_ctid is the location of the replacement tuple. - * (t_xmax is needed to verify that the replacement tuple matches.) + * (xmax is needed to verify that the replacement tuple matches.) */ HTSU_Result heap_delete(Relation relation, ItemPointer tid, @@ -2174,20 +2178,22 @@ l1: */ if (!have_tuple_lock) { - LockTuple(relation, &(tp.t_self), ExclusiveLock); + LockTuple(relation, &(tp.t_self), + get_lockmode_for_tuplelock(LockTupleKeyUpdate)); have_tuple_lock = true; } /* * Sleep until concurrent transaction ends. Note that we don't care - * if the locker has an exclusive or shared lock, because we need - * exclusive. + * which lock mode the locker has, because we need the strongest one. 
*/ if (infomask & HEAP_XMAX_IS_MULTI) { + int remain; + /* wait for multixact */ - MultiXactIdWait((MultiXactId) xwait); + MultiXactIdWait((MultiXactId) xwait, MultiXactStatusKeyUpdate, &remain); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* @@ -2234,8 +2240,8 @@ l1: * We may overwrite if previous xmax aborted, or if it committed but * only locked the tuple without updating it. */ - if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsLocked(tp.t_data)) result = HeapTupleMayBeUpdated; else result = HeapTupleUpdated; @@ -2255,10 +2261,11 @@ l1: result == HeapTupleBeingUpdated); Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = tp.t_data->t_ctid; - *update_xmax = HeapTupleHeaderGetXmax(tp.t_data); + *update_xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); UnlockReleaseBuffer(buffer); if (have_tuple_lock) - UnlockTuple(relation, &(tp.t_self), ExclusiveLock); + UnlockTuple(relation, &(tp.t_self), + get_lockmode_for_tuplelock(LockTupleKeyUpdate)); if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); return result; @@ -2296,7 +2303,7 @@ l1: tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); HeapTupleHeaderClearHotUpdated(tp.t_data); HeapTupleHeaderSetXmax(tp.t_data, xid); @@ -2368,7 +2375,8 @@ l1: * Release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) - UnlockTuple(relation, &(tp.t_self), ExclusiveLock); + UnlockTuple(relation, &(tp.t_self), + get_lockmode_for_tuplelock(LockTupleKeyUpdate)); pgstat_count_heap_delete(relation); @@ -2442,10 +2450,11 @@ simple_heap_delete(Relation relation, ItemPointer tid) * update was done. However, any TOAST changes in the new tuple's * data are not reflected into *newtup. * - * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. 
+ * In the failure cases, the routine returns the tuple's t_ctid and the + * updating Xid (resolving a possible MultiXact, if necessary). * If t_ctid is the same as otid, the tuple was deleted; if different, the * tuple was updated, and t_ctid is the location of the replacement tuple. - * (t_xmax is needed to verify that the replacement tuple matches.) + * (xmax is needed to verify that the replacement tuple matches.) */ HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, @@ -2455,11 +2464,14 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); Bitmapset *hot_attrs; + Bitmapset *key_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; Page page; BlockNumber block; + LockTupleMode tuplock; + MultiXactStatus mxact_status; Buffer buffer, newbuf, vmbuffer = InvalidBuffer, @@ -2471,8 +2483,14 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bool have_tuple_lock = false; bool iscombo; bool use_hot_update = false; + bool key_intact; bool all_visible_cleared = false; bool all_visible_cleared_new = false; + bool keep_xmax_multi = false; + TransactionId keep_xmax = InvalidTransactionId; + TransactionId keep_xmax_old = InvalidTransactionId; + uint16 keep_xmax_infomask = 0; + uint16 keep_xmax_old_infomask = 0; Assert(ItemPointerIsValid(otid)); @@ -2488,7 +2506,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, * Note that we get a copy here, so we need not worry about relcache flush * happening midway through. 
*/ - hot_attrs = RelationGetIndexAttrBitmap(relation); + hot_attrs = RelationGetIndexAttrBitmap(relation, false); + key_attrs = RelationGetIndexAttrBitmap(relation, true); block = ItemPointerGetBlockNumber(otid); buffer = ReadBuffer(relation, block); @@ -2513,6 +2532,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_self = *otid; /* + * If we're not updating any "key" column, we can grab a milder lock type. + * This allows for more concurrency when we are running simultaneously with + * foreign key checks. + */ + if (HeapSatisfiesHOTUpdate(relation, key_attrs, &oldtup, newtup)) + { + tuplock = LockTupleUpdate; + mxact_status = MultiXactStatusUpdate; + key_intact = true; + } + else + { + tuplock = LockTupleKeyUpdate; + mxact_status = MultiXactStatusKeyUpdate; + key_intact = false; + } + + /* * Note: beyond this point, use oldtup not otid to refer to old tuple. * otid may very well point at newtup->t_self, which we will overwrite * with the new tuple's location, so there's great risk of confusion if we @@ -2522,6 +2559,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, l2: result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer); + /* see below about the "no wait" case */ + Assert(result != HeapTupleBeingUpdated || wait); + if (result == HeapTupleInvisible) { UnlockReleaseBuffer(buffer); @@ -2529,8 +2569,21 @@ l2: } else if (result == HeapTupleBeingUpdated && wait) { - TransactionId xwait; + TransactionId xwait; uint16 infomask; + bool none_remain = false; + + /* + * XXX note that we don't consider the "no wait" case here. This + * isn't a problem currently because no caller uses that case, but it + * should be fixed if such a caller is introduced. It wasn't a problem + * previously because this code would always wait, but now that some + * tuple locks do not conflict with one of the lock modes we use, it is + * possible that this case is interesting to handle specially. 
+ * + * This may cause failures with third-party code that calls heap_update + * directly. + */ /* must copy state data before unlocking buffer */ xwait = HeapTupleHeaderGetXmax(oldtup.t_data); @@ -2549,20 +2602,26 @@ l2: */ if (!have_tuple_lock) { - LockTuple(relation, &(oldtup.t_self), ExclusiveLock); + LockTuple(relation, &(oldtup.t_self), + get_lockmode_for_tuplelock(tuplock)); have_tuple_lock = true; } /* - * Sleep until concurrent transaction ends. Note that we don't care - * if the locker has an exclusive or shared lock, because we need - * exclusive. + * Now sleep on the locker. Note that if there are only key-share + * lockers and we're not updating the key columns, we will be awakened + * before it is gone, so we may need to mark the new tuple with a + * new MultiXactId including the original xmax and ourselves. + * + * XXX this comment needs to be more comprehensive */ - if (infomask & HEAP_XMAX_IS_MULTI) { + TransactionId update_xact; + int remain; + /* wait for multixact */ - MultiXactIdWait((MultiXactId) xwait); + MultiXactIdWait((MultiXactId) xwait, mxact_status, &remain); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* @@ -2576,41 +2635,98 @@ l2: goto l2; /* - * You might think the multixact is necessarily done here, but not - * so: it could have surviving members, namely our own xact or - * other subxacts of this backend. It is legal for us to update - * the tuple in either case, however (the latter case is - * essentially a situation of upgrading our former shared lock to - * exclusive). We don't bother changing the on-disk hint bits - * since we are about to overwrite the xmax altogether. + * Note that the multixact may not be done by now. It could have + * surviving members; our own xact or other subxacts of this + * backend, and also any other concurrent transaction that locked + * the tuple with KeyShare if we only got LockTupleUpdate. If this + * is the case, we have to be careful to mark the updated tuple + * with the surviving members in Xmax. 
+ * + * Note that there could have been another update in the MultiXact. + * In that case, we need to check whether it committed or aborted. + * If it aborted we are safe to update it again; otherwise there is + * an update conflict that must be handled below. + * + * In the LockTupleKeyUpdate case, we still need to preserve the + * surviving members: those would include the tuple locks we had + * before this one, which are important to keep in case this + * subxact aborts. */ + update_xact = InvalidTransactionId; + if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_NOT_UPDATE)) + update_xact = HeapTupleGetUpdateXid(oldtup.t_data); + + /* there was no UPDATE in the MultiXact; or it aborted. */ + if (update_xact == InvalidTransactionId || + TransactionIdDidAbort(update_xact)) + { + /* + * if the multixact still has live members, we need to preserve + * it by creating a new multixact. If all members are gone, we + * can simply update the tuple by setting ourselves in Xmax. + */ + if (remain > 0) + { + keep_xmax = HeapTupleHeaderGetXmax(oldtup.t_data); + keep_xmax_multi = + (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) != 0; + } + else + { + /* + * We could set the HEAP_XMAX_INVALID bit here instead of + * using a separate boolean flag. However, since we're going + * to set up a new xmax below, this would waste time + * setting up the buffer's dirty bit. + */ + none_remain = true; + } + } } else { - /* wait for regular transaction to end */ - XactLockTableWait(xwait); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. + * If it's just a key-share locker, and we're not changing the + * key columns, we don't need to wait for it to finish; but we + * need to preserve it as locker. 
*/ - if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data), - xwait)) - goto l2; + if ((oldtup.t_data->t_infomask & HEAP_XMAX_KEYSHR_LOCK) && + key_intact) + { + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + keep_xmax = xwait; + keep_xmax_multi = false; + } + else + { + /* wait for regular transaction to end */ + XactLockTableWait(xwait); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + /* + * xwait is done, but if xwait had just locked the tuple then some + * other xact could update this tuple before we get to this point. + * Check for xmax change, and start over if so. + */ + if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data), + xwait)) + goto l2; + + /* Otherwise check if it committed or aborted */ + UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + } } /* * We may overwrite if previous xmax aborted, or if it committed but - * only locked the tuple without updating it. + * only locked the tuple without updating it, or if we are going to + * keep it around in Xmax. 
*/ - if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) + if (TransactionIdIsValid(keep_xmax) || + none_remain || + (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsLocked(oldtup.t_data)) result = HeapTupleMayBeUpdated; else result = HeapTupleUpdated; @@ -2630,13 +2746,15 @@ l2: result == HeapTupleBeingUpdated); Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = oldtup.t_data->t_ctid; - *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data); + *update_xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); UnlockReleaseBuffer(buffer); if (have_tuple_lock) - UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + UnlockTuple(relation, &(oldtup.t_self), + get_lockmode_for_tuplelock(tuplock)); if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); bms_free(hot_attrs); + bms_free(key_attrs); return result; } @@ -2645,7 +2763,7 @@ l2: * visible while we were busy locking the buffer, or during some subsequent * window during which we had it unlocked, we'll have to unlock and * re-lock, to avoid holding the buffer lock across an I/O. That's a bit - * unfortunate, esepecially since we'll now have to recheck whether the + * unfortunate, especially since we'll now have to recheck whether the * tuple has been locked or updated under us, but hopefully it won't * happen very often. */ @@ -2678,13 +2796,54 @@ l2: Assert(!(newtup->t_data->t_infomask & HEAP_HASOID)); } + /* + * If the tuple we're updating is locked, we need to preserve this in the + * new tuple's Xmax as well as in the old tuple. Prepare the new xmax + * value for these uses. + * + * Note there cannot be an xmax to save if we're changing key columns; in + * this case, the wait above should have only returned when the locking + * transactions finished. 
+ */ + if (TransactionIdIsValid(keep_xmax)) + { + if (keep_xmax_multi) + { + keep_xmax_old = MultiXactIdExpand(keep_xmax, + xid, MultiXactStatusUpdate); + keep_xmax_infomask = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_IS_MULTI; + } + else + { + /* not a multi? must be a KEY SHARE locker */ + keep_xmax_old = MultiXactIdCreate(keep_xmax, MultiXactStatusForKeyShare, + xid, MultiXactStatusUpdate); + keep_xmax_infomask = HEAP_XMAX_KEYSHR_LOCK; + } + keep_xmax_old_infomask = HEAP_XMAX_IS_MULTI | HEAP_XMAX_KEYSHR_LOCK; + /* FIXME -- need more infomask bits? */ + } + + /* + * Prepare the new tuple with the appropriate initial values of Xmin and + * Xmax, as well as initial infomask bits. + */ newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); - newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED); + newtup->t_data->t_infomask |= HEAP_UPDATED; HeapTupleHeaderSetXmin(newtup->t_data, xid); HeapTupleHeaderSetCmin(newtup->t_data, cid); - HeapTupleHeaderSetXmax(newtup->t_data, 0); /* for cleanliness */ newtup->t_tableOid = RelationGetRelid(relation); + if (TransactionIdIsValid(keep_xmax)) + { + newtup->t_data->t_infomask |= keep_xmax_infomask; + HeapTupleHeaderSetXmax(newtup->t_data, keep_xmax); + } + else + { + newtup->t_data->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderSetXmax(newtup->t_data, 0); /* for cleanliness */ + } /* * Replace cid with a combo cid if necessary. Note that we already put @@ -2725,11 +2884,20 @@ l2: oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_UPDATE_KEY_INTACT; HeapTupleClearHotUpdated(&oldtup); /* ... 
and store info about transaction updating this tuple */ - HeapTupleHeaderSetXmax(oldtup.t_data, xid); + if (TransactionIdIsValid(keep_xmax_old)) + { + HeapTupleHeaderSetXmax(oldtup.t_data, keep_xmax_old); + oldtup.t_data->t_infomask |= keep_xmax_old_infomask; + } + else + HeapTupleHeaderSetXmax(oldtup.t_data, xid); + if (key_intact) + oldtup.t_data->t_infomask2 |= HEAP_UPDATE_KEY_INTACT; HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); /* temporarily make it look not-updated */ oldtup.t_data->t_ctid = oldtup.t_self; @@ -2883,10 +3051,19 @@ l2: oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~HEAP_UPDATE_KEY_INTACT; /* ... and store info about transaction updating this tuple */ - HeapTupleHeaderSetXmax(oldtup.t_data, xid); + if (TransactionIdIsValid(keep_xmax_old)) + { + HeapTupleHeaderSetXmax(oldtup.t_data, keep_xmax_old); + oldtup.t_data->t_infomask |= keep_xmax_old_infomask; + } + else + HeapTupleHeaderSetXmax(oldtup.t_data, xid); + if (key_intact) + oldtup.t_data->t_infomask2 |= HEAP_UPDATE_KEY_INTACT; HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); } @@ -2959,7 +3136,8 @@ l2: * Release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) - UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + UnlockTuple(relation, &(oldtup.t_self), + get_lockmode_for_tuplelock(tuplock)); pgstat_count_heap_update(relation, use_hot_update); @@ -2974,6 +3152,7 @@ l2: } bms_free(hot_attrs); + bms_free(key_attrs); return HeapTupleMayBeUpdated; } @@ -3129,6 +3308,54 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) } /* + * Return the appropriate LOCKMODE to acquire by LockTuple corresponding to the + * given lock tuple mode. + * + * These heavyweight lock modes have been chosen because they exactly mimic + * the lock conflict behavior that our tuple lock modes need to have. 
+ */ +static LOCKMODE +get_lockmode_for_tuplelock(LockTupleMode mode) +{ + switch (mode) + { + case LockTupleKeyShare: + return AccessShareLock; + case LockTupleShare: + return RowShareLock; + case LockTupleUpdate: + return ExclusiveLock; + case LockTupleKeyUpdate: + return AccessExclusiveLock; + default: + elog(ERROR, "invalid lock tuple mode %d", mode); + return 0; /* keep compiler quiet */ + } +} + +/* + * Return the MultiXactStatus corresponding to the given tuple lock mode. + */ +static MultiXactStatus +get_mxact_status_for_tuplelock(LockTupleMode mode) +{ + switch (mode) + { + case LockTupleKeyShare: + return MultiXactStatusForKeyShare; + case LockTupleShare: + return MultiXactStatusForShare; + case LockTupleUpdate: + return MultiXactStatusForUpdate; + case LockTupleKeyUpdate: + return MultiXactStatusUpdate; + default: + elog(ERROR, "invalid lock tuple mode %d", mode); + return 0; /* keep compiler quiet */ + } +} + +/* * heap_lock_tuple - lock a tuple in shared or exclusive mode * * Note that this acquires a buffer pin, which the caller must release. @@ -3152,10 +3379,11 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * HeapTupleSelfUpdated: lock failed because tuple updated by self * HeapTupleUpdated: lock failed because tuple updated by other xact * - * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. + * In the failure cases, the routine returns the tuple's t_ctid and the + * updating Xid (resolving a possible MultiXact, if necessary). * If t_ctid is the same as t_self, the tuple was deleted; if different, the * tuple was updated, and t_ctid is the location of the replacement tuple. - * (t_xmax is needed to verify that the replacement tuple matches.) + * (xmax is needed to verify that the replacement tuple matches.) 
* * * NOTES: because the shared-memory lock table is of finite size, but users @@ -3201,13 +3429,13 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, Page page; TransactionId xid; TransactionId xmax; + TransactionId keep_xmax = InvalidTransactionId; + bool keep_xmax_multi = false; + bool none_remains = false; uint16 old_infomask; uint16 new_infomask; - LOCKMODE tuple_lock_type; bool have_tuple_lock = false; - tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock; - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); @@ -3220,6 +3448,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, tuple->t_tableOid = RelationGetRelid(relation); l3: + /* shouldn't get back here if we already set keep_xmax */ + Assert(keep_xmax == InvalidTransactionId); + result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer); if (result == HeapTupleInvisible) @@ -3231,30 +3462,70 @@ l3: { TransactionId xwait; uint16 infomask; + uint16 infomask2; + bool require_sleep; /* must copy state data before unlocking buffer */ xwait = HeapTupleHeaderGetXmax(tuple->t_data); infomask = tuple->t_data->t_infomask; + infomask2 = tuple->t_data->t_infomask2; LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); /* - * If we wish to acquire share lock, and the tuple is already - * share-locked by a multixact that includes any subtransaction of the - * current top transaction, then we effectively hold the desired lock - * already. We *must* succeed without trying to take the tuple lock, - * else we will deadlock against anyone waiting to acquire exclusive - * lock. We don't need to make any state changes in this case. 
+ * If we wish to acquire share or key lock, and the tuple is already + * key or share locked by a multixact that includes any subtransaction + * of the current top transaction, then we effectively hold the desired + * lock already (except if we own key share lock and now desire share + * lock). We *must* succeed without trying to take the tuple lock, + * else we will deadlock against anyone wanting to acquire a stronger + * lock. + * + * FIXME -- we don't do the below currently, but I think we should: + * + * We update the Xmax with a new MultiXactId to include the new lock + * mode in this case. + * + * Note that since we want to alter the Xmax, we need to re-acquire the + * buffer lock. The xmax could have changed in the meantime, so we + * recheck it in that case, but we keep the buffer lock while doing it + * to prevent starvation. The second time around we know we must be + * part of the MultiXactId in any case, which is why we don't need to + * go back to recheck HeapTupleSatisfiesUpdate. Also, after we + * re-acquire lock, the MultiXact is likely to (but not necessarily) be + * the same that we see here, so it should be in multixact's cache and + * thus quick to obtain. 
*/ - if (mode == LockTupleShared && - (infomask & HEAP_XMAX_IS_MULTI) && - MultiXactIdIsCurrent((MultiXactId) xwait)) + if ((infomask & HEAP_XMAX_IS_MULTI) && + ((mode == LockTupleShare) || (mode == LockTupleKeyShare))) { - Assert(infomask & HEAP_XMAX_SHARED_LOCK); - /* Probably can't hold tuple lock here, but may as well check */ - if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); - return HeapTupleMayBeUpdated; + int i; + int nmembers; + MultiXactMember *members; + + nmembers = GetMultiXactIdMembers(xwait, &members); + + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) + { + if ((mode == LockTupleKeyShare) || + ((mode == LockTupleShare) && + (members[i].status >= MultiXactStatusForShare))) + { + if (have_tuple_lock) + UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode)); + /* + * FIXME -- here we should lock buffer, update xmax, + * release buffer + */ + pfree(members); + return HeapTupleMayBeUpdated; + } + } + } + + pfree(members); } /* @@ -3270,105 +3541,240 @@ l3: { if (nowait) { - if (!ConditionalLockTuple(relation, tid, tuple_lock_type)) + if (!ConditionalLockTuple(relation, tid, get_lockmode_for_tuplelock(mode))) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); } else - LockTuple(relation, tid, tuple_lock_type); + LockTuple(relation, tid, get_lockmode_for_tuplelock(mode)); have_tuple_lock = true; } - if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK)) + /* + * If we're requesting KeyShare, and there's no update present, we + * don't need to wait for locking transaction(s) to finish. Even if + * there is an update, we can still continue if the key hasn't been + * modified. 
+ */ + require_sleep = true; + if ((mode == LockTupleKeyShare) && + (HeapTupleHeaderInfomaskIsLocked(infomask) || + infomask2 & HEAP_UPDATE_KEY_INTACT)) { - /* - * Acquiring sharelock when there's at least one sharelocker - * already. We need not wait for him/them to complete. - */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); /* - * Make sure it's still a shared lock, else start over. (It's OK - * if the ownership of the shared lock has changed, though.) + * Make sure it's still an appropriate lock, else start over. */ - if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK)) + if (!(HeapTupleHeaderIsLocked(tuple->t_data) || + (tuple->t_data->t_infomask2 & HEAP_UPDATE_KEY_INTACT))) goto l3; + require_sleep = false; + /* acquire fresh values -- XXX do we need to restart if xmax changed? */ + keep_xmax = HeapTupleHeaderGetXmax(tuple->t_data); + keep_xmax_multi = (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) != 0; } - else if (infomask & HEAP_XMAX_IS_MULTI) - { - /* wait for multixact to end */ - if (nowait) - { - if (!ConditionalMultiXactIdWait((MultiXactId) xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); - } - else - MultiXactIdWait((MultiXactId) xwait); + /* + * If we're requesting Share, we need to ensure there's no update + * and no exclusive lock present. + */ + if (mode == LockTupleShare && + (infomask & (HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_IS_NOT_UPDATE)) && + !(infomask & HEAP_XMAX_EXCL_LOCK)) + { LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); /* - * If xwait had just locked the tuple then some other xact could - * update this tuple before we get to this point. Check for xmax - * change, and start over if so. + * make sure it's still an appropriate lock, else start over. 
*/ - if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), - xwait)) + if (!(tuple->t_data->t_infomask & + (HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_IS_NOT_UPDATE)) || + (tuple->t_data->t_infomask & HEAP_XMAX_EXCL_LOCK)) goto l3; + require_sleep = false; + /* acquire fresh values */ + keep_xmax = HeapTupleHeaderGetXmax(tuple->t_data); + keep_xmax_multi = (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) != 0; + } - /* - * You might think the multixact is necessarily done here, but not - * so: it could have surviving members, namely our own xact or - * other subxacts of this backend. It is legal for us to lock the - * tuple in either case, however. We don't bother changing the - * on-disk hint bits since we are about to overwrite the xmax - * altogether. - */ + + /* + * If our lock is Update, we might also be able to skip the sleep; for + * this to be true, we need to ensure that there's no other lock type + * than KeyShare. + */ + if (mode == LockTupleUpdate) + { + if (infomask & HEAP_XMAX_IS_MULTI) + { + int nmembers; + MultiXactMember *members; + + /* + * This needs to be done the slow way: there might be + * MultiXactStatusForShare locks hiding in there, and there's + * no way to tell from just the hint bits. + */ + nmembers = GetMultiXactIdMembers(xwait, &members); + if (nmembers == 0) + { + require_sleep = false; + /* + * No need to keep the previous xmax here. Unlikely to + * happen anyway. + */ + } + else + { + int i; + bool allowed = true; + + for (i = 0; i < nmembers; i++) + { + if (members[i].status != MultiXactStatusForKeyShare) + { + allowed = false; + break; + } + } + if (allowed) + { + /* + * if the xmax changed under us in the meantime, start + * over. 
+ */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), + xwait)) + goto l3; + /* otherwise, we're good */ + require_sleep = false; + keep_xmax = xwait; + keep_xmax_multi = true; + } + } + } + else if (infomask & HEAP_XMAX_KEYSHR_LOCK) + { + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* if the xmax changed in the meantime, start over */ + if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), + xwait)) + goto l3; + /* otherwise, we're good */ + require_sleep = false; + keep_xmax = xwait; + keep_xmax_multi = false; + } } - else + + /* + * By here, we either require to wait for the locking transaction or + * multixact, or have already acquired the buffer exclusive lock. + */ + + if (require_sleep) { - /* wait for regular transaction to end */ - if (nowait) + if (infomask & HEAP_XMAX_IS_MULTI) { - if (!ConditionalXactLockTableWait(xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", - RelationGetRelationName(relation)))); + MultiXactStatus status = get_mxact_status_for_tuplelock(mode); + int remain; + + /* We only ever lock tuples, never update them */ + if (status >= MultiXactStatusUpdate) + elog(ERROR, "invalid lock mode in heap_lock_tuple"); + + /* wait for multixact to end */ + if (nowait) + { + if (!ConditionalMultiXactIdWait((MultiXactId) xwait, status, &remain)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + } + else + MultiXactIdWait((MultiXactId) xwait, status, &remain); + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If xwait had just locked the tuple then some other xact could + * update this tuple before we get to this point. Check for xmax + * change, and start over if so.
+ */ + if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), + xwait)) + goto l3; + + /* + * Of course, the multixact might not be done here: if we're requesting + * a light lock mode, other transactions with light locks could still + * be alive, as well as locks owned by our own xact or other + * subxacts of this backend. We need to preserve the surviving + * MultiXact members. Note that it isn't absolutely necessary + * in the latter case, but doing so is simpler. + */ + if (remain > 0) + { + keep_xmax = xwait; + keep_xmax_multi = true; + } + else + none_remains = true; } else - XactLockTableWait(xwait); + { + /* wait for regular transaction to end */ + if (nowait) + { + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + } + else + XactLockTableWait(xwait); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. - */ - if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), - xwait)) - goto l3; + /* + * xwait is done, but if xwait had just locked the tuple then + * some other xact could update this tuple before we get to + * this point. Check for xmax change, and start over if so. + */ + if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data), + xwait)) + goto l3; - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + /* + * Otherwise check if it committed or aborted. 
Note we cannot + * be here if the tuple was only locked by somebody who didn't + * conflict with us; that should have been handled above. So + * that transaction must necessarily be gone by now. + */ + UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + } } /* * We may lock if previous xmax aborted, or if it committed but only - * locked the tuple without updating it. The case where we didn't - * wait because we are joining an existing shared lock is correctly - * handled, too. + * locked the tuple without updating it; or if we didn't have to wait + * at all for whatever reason. */ - if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) + if (!require_sleep || + (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsLocked(tuple->t_data) || + none_remains) result = HeapTupleMayBeUpdated; else result = HeapTupleUpdated; @@ -3379,10 +3785,10 @@ l3: Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = tuple->t_data->t_ctid; - *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data); + *update_xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); + UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode)); return result; } @@ -3394,8 +3800,10 @@ l3: * for cases where it is a plain TransactionId. * * Note in particular that this covers the case where we already hold - * exclusive lock on the tuple and the caller only wants shared lock. It - * would certainly not do to give up the exclusive lock. + * exclusive lock on the tuple and the caller only wants key share or share + * lock. It would certainly not do to give up the exclusive lock. Note + * there's no explicit test for a share lock only; this was already covered + * above, because it's only representable by a MultiXactId. 
*/ xmax = HeapTupleHeaderGetXmax(tuple->t_data); old_infomask = tuple->t_data->t_infomask; @@ -3403,15 +3811,15 @@ l3: if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED | HEAP_XMAX_IS_MULTI)) && - (mode == LockTupleShared ? - (old_infomask & HEAP_IS_LOCKED) : - (old_infomask & HEAP_XMAX_EXCL_LOCK)) && - TransactionIdIsCurrentTransactionId(xmax)) + (mode == LockTupleKeyShare ? + (old_infomask & (HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK)) : + (old_infomask & HEAP_XMAX_EXCL_LOCK)) && + TransactionIdIsCurrentTransactionId(xmax)) { LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); /* Probably can't hold tuple lock here, but may as well check */ if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); + UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode)); return HeapTupleMayBeUpdated; } @@ -3425,22 +3833,69 @@ l3: new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); - if (mode == LockTupleShared) + /* + * if we have keep_xmax, this is easy to compute -- just create a new mxact + * including our new xid plus whatever there was on Xmax previously. + */ + if (TransactionIdIsValid(keep_xmax)) { - /* - * If this is the first acquisition of a shared lock in the current - * transaction, set my per-backend OldestMemberMXactId setting. We can - * be certain that the transaction will never become a member of any - * older MultiXactIds than that. (We have to do this even if we end - * up just using our own TransactionId below, since some other backend - * could incorporate our XID into a MultiXact immediately afterwards.) - */ - MultiXactIdSetOldestMember(); + if (keep_xmax_multi) + { + /* + * MultiXactIdExpand takes care to remove members that are no + * longer current.
+ */ + xid = MultiXactIdExpand((MultiXactId) keep_xmax, xid, + get_mxact_status_for_tuplelock(mode)); + new_infomask |= GetMultiXactIdHintBits(xid); + } + else if (TransactionIdIsInProgress(keep_xmax)) + { + MultiXactStatus existing_lock_mode; - new_infomask |= HEAP_XMAX_SHARED_LOCK; + if (old_infomask & HEAP_XMAX_EXCL_LOCK) + existing_lock_mode = MultiXactStatusForUpdate; + else if (old_infomask & HEAP_XMAX_KEYSHR_LOCK) + existing_lock_mode = MultiXactStatusForKeyShare; + else + /* must be a shared lock */ + existing_lock_mode = MultiXactStatusForShare; + + xid = MultiXactIdCreate(keep_xmax, existing_lock_mode, + xid, get_mxact_status_for_tuplelock(mode)); + new_infomask |= GetMultiXactIdHintBits(xid); + } + else + { + /* + * Not multi, not in progress. Use only our own Xid. + */ + switch (mode) + { + case LockTupleKeyShare: + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShare: + /* need a multixact here in any case */ + xid = MultiXactIdCreateSingleton(xid, MultiXactStatusForShare); + new_infomask |= GetMultiXactIdHintBits(xid); + break; + case LockTupleUpdate: + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + default: + elog(ERROR, "invalid lock mode"); + } + } + } + else + { + MultiXactStatus new_mxact_status; + + new_mxact_status = get_mxact_status_for_tuplelock(mode); /* * Check to see if we need a MultiXactId because there are multiple @@ -3465,8 +3920,9 @@ l3: * If the XMAX is already a MultiXactId, then we need to * expand it to include our own TransactionId. */ - xid = MultiXactIdExpand((MultiXactId) xmax, xid); - new_infomask |= HEAP_XMAX_IS_MULTI; + xid = MultiXactIdExpand((MultiXactId) xmax, xid, new_mxact_status); + new_infomask |= GetMultiXactIdHintBits(xid); + /* FIXME -- we need to add bits to the infomask here! */ } else if (TransactionIdIsInProgress(xmax)) { @@ -3475,8 +3931,30 @@ l3: * create a new MultiXactId that includes both the old locker * and our own TransactionId. 
*/ - xid = MultiXactIdCreate(xmax, xid); - new_infomask |= HEAP_XMAX_IS_MULTI; + MultiXactStatus status; + + if (old_infomask & HEAP_XMAX_EXCL_LOCK) + status = MultiXactStatusForUpdate; + else if (old_infomask & HEAP_XMAX_KEYSHR_LOCK) + status = MultiXactStatusForKeyShare; + else + { + status = 0; /* keep compiler quiet */ + elog(ERROR, "no lock bit found on old infomask %u", old_infomask); + } + + xid = MultiXactIdCreate(xmax, status, xid, new_mxact_status); + new_infomask |= GetMultiXactIdHintBits(xid); + /* FIXME -- we need to add bits to the infomask here! */ + } + else if (mode == LockTupleShare) + { + /* + * There's no hint bit for FOR SHARE, so we need a multixact + * here no matter what. + */ + xid = MultiXactIdCreateSingleton(xid, new_mxact_status); + new_infomask |= GetMultiXactIdHintBits(xid); } else { @@ -3486,6 +3964,22 @@ l3: * TransactionIdIsInProgress() got to run. Treat it like * there's no locker in the tuple. */ + switch (mode) + { + case LockTupleKeyShare: + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShare: + /* need a multixact here in any case */ + xid = MultiXactIdCreateSingleton(xid, MultiXactStatusForShare); + new_infomask |= GetMultiXactIdHintBits(xid); + break; + case LockTupleUpdate: + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + default: + elog(ERROR, "invalid lock mode"); + } } } else @@ -3494,13 +3988,24 @@ l3: * There was no previous locker, so just insert our own * TransactionId. 
*/ + switch (mode) + { + case LockTupleKeyShare: + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShare: + /* need a multixact here in any case */ + xid = MultiXactIdCreateSingleton(xid, MultiXactStatusForShare); + new_infomask |= GetMultiXactIdHintBits(xid); + break; + case LockTupleUpdate: + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + default: + elog(ERROR, "invalid lock mode"); + } } } - else - { - /* We want an exclusive lock on the tuple */ - new_infomask |= HEAP_XMAX_EXCL_LOCK; - } START_CRIT_SECTION(); @@ -3508,12 +4013,14 @@ l3: * Store transaction information of xact locking the tuple. * * Note: Cmax is meaningless in this context, so don't set it; this avoids - * possibly generating a useless combo CID. + * possibly generating a useless combo CID. FIXME -- it's not useless + * if a multixact contains an update. */ tuple->t_data->t_infomask = new_infomask; HeapTupleHeaderClearHotUpdated(tuple->t_data); HeapTupleHeaderSetXmax(tuple->t_data, xid); /* Make sure there is no forward chain link in t_ctid */ + /* FIXME -- this needs some thought */ tuple->t_data->t_ctid = *tid; MarkBufferDirty(*buffer); @@ -3539,8 +4046,17 @@ l3: xlrec.target.node = relation->rd_node; xlrec.target.tid = tuple->t_self; xlrec.locking_xid = xid; - xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0); - xlrec.shared_lock = (mode == LockTupleShared); + xlrec.infobits_set = + (((new_infomask & HEAP_XMAX_IS_MULTI) != 0) ? + XLHL_XMAX_IS_MULTI : 0) | + (((new_infomask & HEAP_XMAX_IS_NOT_UPDATE) != 0) ? + XLHL_XMAX_IS_NOT_UPDATE : 0) | + (((new_infomask & HEAP_XMAX_EXCL_LOCK) != 0) ? + XLHL_XMAX_EXCL_LOCK : 0) | + (((new_infomask & HEAP_XMAX_KEYSHR_LOCK) != 0) ? + XLHL_XMAX_KEYSHR_LOCK : 0) | + (((tuple->t_data->t_infomask2 & HEAP_UPDATE_KEY_INTACT) != 0) ? + XLHL_UPDATE_KEY_INTACT : 0); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapLock; rdata[0].buffer = InvalidBuffer; @@ -3572,7 +4088,7 @@ l3: * release the lmgr tuple lock, if we had it.
*/ if (have_tuple_lock) - UnlockTuple(relation, tid, tuple_lock_type); + UnlockTuple(relation, tid, get_lockmode_for_tuplelock(mode)); return HeapTupleMayBeUpdated; } @@ -3789,6 +4305,8 @@ recheck_xmax: * extremely low-probability scenario with minimal downside even if * it does happen, so for now we don't do the extra bookkeeping that * would be needed to clean out MultiXactIds. + * + * FIXME -- today is that day. Figure this out. *---------- */ } @@ -3841,6 +4359,105 @@ recheck_xvac: return changed; } +/* + * For a given MultiXactId, return the hint bits that should be set in the + * tuple's infomask. + * + * Normally this should be called for a multixact that was just created, and + * so is on our local cache, so the GetMembers call is fast. + */ +static uint16 +GetMultiXactIdHintBits(MultiXactId multi) +{ + int nmembers; + MultiXactMember *members; + int i; + uint16 bits = HEAP_XMAX_IS_MULTI; + bool has_update = false; + + nmembers = GetMultiXactIdMembers(multi, &members); + + for (i = 0; i < nmembers; i++) + { + Assert(members[i].status != MultiXactStatusKeyUpdate); + switch (members[i].status) + { + case MultiXactStatusForKeyShare: + bits |= HEAP_XMAX_KEYSHR_LOCK; + break; + case MultiXactStatusForShare: + break; + case MultiXactStatusForUpdate: + Assert(!has_update); + bits |= HEAP_XMAX_EXCL_LOCK; + break; + case MultiXactStatusUpdate: + Assert(!(bits & HEAP_XMAX_EXCL_LOCK)); + has_update = true; + break; + case MultiXactStatusKeyUpdate: + elog(ERROR, "invalid multixact value"); + break; + } + } + if (!has_update) + bits |= HEAP_XMAX_IS_NOT_UPDATE; + + return bits; +} + +/* + * HeapTupleGetUpdateXid + * + * Given a tuple with a multixact Xmax, and which does not have the + * HEAP_XMAX_IS_NOT_UPDATE bit set, obtain and return the Xid of the updating + * transaction. + * + * See also HeapTupleHeaderGetUpdateXid, which can be used without previously + * checking the hint bits. 
+ */ +TransactionId +HeapTupleGetUpdateXid(HeapTupleHeader tuple) +{ + TransactionId update_xact = InvalidTransactionId; + MultiXactMember *members; + int nmembers; + + Assert(!(tuple->t_infomask & HEAP_XMAX_IS_NOT_UPDATE)); + Assert(tuple->t_infomask & HEAP_XMAX_IS_MULTI); + + nmembers = GetMultiXactIdMembers(HeapTupleHeaderGetXmax(tuple), &members); + + if (nmembers > 0) + { + int i; + + for (i = 0; i < nmembers; i++) + { + /* KEY SHARE lockers are okay -- ignore it */ + if (members[i].status == MultiXactStatusForKeyShare) + continue; + /* + * SHARE lockers are okay, though since they normally conflict with + * UPDATE, they are not expected unless they come from the same + * xact as the update. + */ + if (members[i].status == MultiXactStatusForShare || + members[i].status == MultiXactStatusForUpdate) + continue; + /* there should be at most one updater */ + Assert(update_xact == InvalidTransactionId); + Assert(members[i].status == MultiXactStatusUpdate); + update_xact = members[i].xid; +#ifndef USE_ASSERT_CHECKING + break; +#endif + } + } + + return update_xact; +} + /* ---------------- * heap_markpos - mark scan position @@ -3919,6 +4536,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid) { TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + /* FIXME -- change this? 
*/ TransactionId xmax = HeapTupleHeaderGetXmax(tuple); TransactionId xvac = HeapTupleHeaderGetXvac(tuple); @@ -4606,7 +5224,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); @@ -4813,7 +5431,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); if (hot_update) HeapTupleHeaderSetHotUpdated(htup); @@ -4991,14 +5609,18 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | - HEAP_IS_LOCKED | + HEAP_LOCK_BITS | HEAP_MOVED); - if (xlrec->xid_is_mxact) + if (xlrec->infobits_set & XLHL_XMAX_IS_MULTI) htup->t_infomask |= HEAP_XMAX_IS_MULTI; - if (xlrec->shared_lock) - htup->t_infomask |= HEAP_XMAX_SHARED_LOCK; - else + if (xlrec->infobits_set & XLHL_XMAX_IS_NOT_UPDATE) + htup->t_infomask |= HEAP_XMAX_IS_NOT_UPDATE; + if (xlrec->infobits_set & XLHL_XMAX_EXCL_LOCK) htup->t_infomask |= HEAP_XMAX_EXCL_LOCK; + if (xlrec->infobits_set & XLHL_XMAX_KEYSHR_LOCK) + htup->t_infomask |= HEAP_XMAX_KEYSHR_LOCK; + if (xlrec->infobits_set & XLHL_UPDATE_KEY_INTACT) + htup->t_infomask2 |= HEAP_UPDATE_KEY_INTACT; HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); @@ -5202,16 +5824,19 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_heap_lock *xlrec = (xl_heap_lock *) rec; - if (xlrec->shared_lock) - appendStringInfo(buf, "shared_lock: "); - else - appendStringInfo(buf, "exclusive_lock: "); - if (xlrec->xid_is_mxact) - appendStringInfo(buf, "mxid "); - else - appendStringInfo(buf, "xid "); - appendStringInfo(buf, "%u ", 
xlrec->locking_xid); + appendStringInfo(buf, "lock %u: ", xlrec->locking_xid); out_target(buf, &(xlrec->target)); + appendStringInfoChar(buf, ' '); + if (xlrec->infobits_set & XLHL_XMAX_IS_MULTI) + appendStringInfo(buf, "XMAX_IS_MULTI "); + if (xlrec->infobits_set & XLHL_XMAX_IS_NOT_UPDATE) + appendStringInfo(buf, "XMAX_IS_NOT_UPDATE "); + if (xlrec->infobits_set & XLHL_XMAX_EXCL_LOCK) + appendStringInfo(buf, "XMAX_EXCL_LOCK "); + if (xlrec->infobits_set & XLHL_XMAX_KEYSHR_LOCK) + appendStringInfo(buf, "XMAX_KEYSHR_LOCK "); + if (xlrec->infobits_set & XLHL_UPDATE_KEY_INTACT) + appendStringInfo(buf, "UPDATE_KEY_INTACT "); } else if (info == XLOG_HEAP_INPLACE) { diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index e561409..3469ebe 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -352,15 +352,15 @@ rewrite_heap_tuple(RewriteState state, /* * If the tuple has been updated, check the old-to-new mapping hash table. */ - if (!(old_tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | - HEAP_IS_LOCKED)) && + if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HeapTupleHeaderIsLocked(old_tuple->t_data)) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) { OldToNewMapping mapping; memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetXmax(old_tuple->t_data); + hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); hashkey.tid = old_tuple->t_data->t_ctid; mapping = (OldToNewMapping) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index c1c8ba5..a4dc146 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -4,7 +4,7 @@ * PostgreSQL multi-transaction-log manager * * The pg_multixact manager is a pg_clog-like manager that stores an array - * of TransactionIds for each MultiXactId. 
It is a fundamental part of the + * of MultiXactMember for each MultiXactId. It is a fundamental part of the * shared-row-lock implementation. A share-locked tuple stores a * MultiXactId in its Xmax, and a transaction that needs to wait for the * tuple to be unlocked can sleep on the potentially-several TransactionIds @@ -48,6 +48,8 @@ */ #include "postgres.h" +#include + #include "access/multixact.h" #include "access/slru.h" #include "access/transam.h" @@ -60,6 +62,7 @@ #include "storage/procarray.h" #include "utils/builtins.h" #include "utils/memutils.h" +#include "utils/snapmgr.h" /* @@ -75,19 +78,58 @@ * (see MultiXact{Offset,Member}PagePrecedes). */ -/* We need four bytes per offset and also four bytes per member */ +/* We need four bytes per offset */ #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) -#define MULTIXACT_MEMBERS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) #define MultiXactIdToOffsetPage(xid) \ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) #define MultiXactIdToOffsetEntry(xid) \ ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) -#define MXOffsetToMemberPage(xid) \ - ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) -#define MXOffsetToMemberEntry(xid) \ - ((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) +/* + * The situation for members is a bit more complex: we need to store two + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags (so 16 bit pairs), and + * then the corresponding 16 Xids. Each such 17-word (68-byte) set we call a + * "group", and are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep + * 120 groups per page. This wastes 32 bytes per page, but that's OK -- + * simplicity (and performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. 
+ */ +/* We need two bits per xact, so four xacts fit in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 2 +#define MXACT_MEMBER_FLAGS_PER_BYTE 4 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) + +/* Location (byte offset within page) of flag word for a given member */ +#define MXOffsetToFlagsOffset(xid) \ + ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ + (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \ + (TransactionId) MULTIXACT_MEMBERGROUP_SIZE) +#define MXOffsetToFlagsBitShift(xid) \ + (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \ + MXACT_MEMBER_BITS_PER_XACT) + +/* Location (byte offset within page) of TransactionId of given member */ +#define MXOffsetToMemberOffset(xid) \ + (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \ + ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId)) /* @@ -114,60 +156,51 @@ typedef struct MultiXactStateData /* next-to-be-assigned offset */ MultiXactOffset nextOffset; - /* the Offset SLRU area was last truncated at this MultiXactId */ - MultiXactId lastTruncationPoint; + /* truncation info for the oldest segment in the offset SLRU area */ + TransactionId truncateXid; + uint32 truncateXidEpoch; /* - * Per-backend data starts here. 
We have two arrays stored in the area - * immediately following the MultiXactStateData struct. Each is indexed by - * BackendId. - * - * In both arrays, there's a slot for all normal backends (1..MaxBackends) - * followed by a slot for max_prepared_xacts prepared transactions. Valid - * BackendIds start from 1; element zero of each array is never used. - * - * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current - * transaction(s) could possibly be a member of, or InvalidMultiXactId - * when the backend has no live transaction that could possibly be a - * member of a MultiXact. Each backend sets its entry to the current - * nextMXact counter just before first acquiring a shared lock in a given - * transaction, and clears it at transaction end. (This works because only - * during or after acquiring a shared lock could an XID possibly become a - * member of a MultiXact, and that MultiXact would have to be created - * during or after the lock acquisition.) - * - * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's - * current transaction(s) think is potentially live, or InvalidMultiXactId - * when not in a transaction or not in a transaction that's paid any - * attention to MultiXacts yet. This is computed when first needed in a - * given transaction, and cleared at transaction end. We can compute it - * as the minimum of the valid OldestMemberMXactId[] entries at the time - * we compute it (using nextMXact if none are valid). Each backend is - * required not to attempt to access any SLRU data for MultiXactIds older - * than its own OldestVisibleMXactId[] setting; this is necessary because - * the checkpointer could truncate away such data at any instant. - * - * The checkpointer can compute the safe truncation point as the oldest - * valid value among all the OldestMemberMXactId[] and - * OldestVisibleMXactId[] entries, or nextMXact if none are valid. 
- * Clearly, it is not possible for any later-computed OldestVisibleMXactId - * value to be older than this, and so there is no risk of truncating data - * that is still needed. + * oldest multixact that is still on disk. Anything older than this should + * not be consulted. */ - MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */ + MultiXactId oldestMultiXactId; } MultiXactStateData; +/* Pointer to the state data in shared memory */ +static MultiXactStateData *MultiXactState; + +#define firstPageOf(segment) ((segment) * SLRU_PAGES_PER_SEGMENT) + /* - * Last element of OldestMemberMXactID and OldestVisibleMXactId arrays. - * Valid elements are (1..MaxOldestSlot); element 0 is never used. + * structs to pass data around in our private SlruScanDirectory callback for + * the offset truncation support code. */ -#define MaxOldestSlot (MaxBackends + max_prepared_xacts) +typedef struct SegmentInfo +{ + int segno; /* segment number */ + TransactionId truncateXid; /* after this Xid is frozen, the previous + * segment can be removed */ + uint32 truncateXidEpoch; /* epoch of above Xid */ + MultiXactOffset firstOffset; /* first valid offset in segment */ +} SegmentInfo; -/* Pointers to the state data in shared memory */ -static MultiXactStateData *MultiXactState; -static MultiXactId *OldestMemberMXactId; -static MultiXactId *OldestVisibleMXactId; +typedef struct TruncateCbData +{ + int remaining_alloc; + int remaining_used; + SegmentInfo *remaining; +} TruncateCbData; +/* + * MultiXactZeroOffsetPage xlog record + */ +typedef struct MxactZeroOffPg +{ + int pageno; + TransactionId truncateXid; + uint32 truncateXidEpoch; /* epoch of above Xid */ +} MxactZeroOffPg; /* * Definitions for the backend-local MultiXactId cache. @@ -180,7 +213,8 @@ static MultiXactId *OldestVisibleMXactId; * so they will be uninteresting by the time our next transaction starts. * (XXX not clear that this is correct --- other members of the MultiXact * could hang around longer than we did.
However, it's not clear what a - * better policy for flushing old cache entries would be.) + * better policy for flushing old cache entries would be.) FIXME actually + * this is plain wrong now that multixact's may contain update Xids. * * We allocate the cache entries in a memory context that is deleted at * transaction end, so we don't need to do retail freeing of entries. @@ -189,44 +223,72 @@ typedef struct mXactCacheEnt { struct mXactCacheEnt *next; MultiXactId multi; - int nxids; - TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */ + int nmembers; + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; } mXactCacheEnt; static mXactCacheEnt *MXactCache = NULL; static MemoryContext MXactContext = NULL; +/* status conflict table */ +static const bool MultiXactConflicts[5][5] = +{ + { /* ForKeyShare */ + false, false, false, false, true + }, + { /* ForShare */ + false, false, true, true, true + }, + { /* ForUpdate */ + false, true, true, true, true + }, + { /* Update */ + false, true, true, true, true + }, + { /* KeyUpdate */ + true, true, true, true, true + } +}; + +#define MultiXactStatusConflict(status1, status2) \ + MultiXactConflicts[status1][status2] + +#define MULTIXACT_DEBUG #ifdef MULTIXACT_DEBUG #define debug_elog2(a,b) elog(a,b) #define debug_elog3(a,b,c) elog(a,b,c) #define debug_elog4(a,b,c,d) elog(a,b,c,d) #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e) +#define debug_elog7(a,b,c,d,e,f,g) elog(a,b,c,d,e,f,g) #else #define debug_elog2(a,b) #define debug_elog3(a,b,c) #define debug_elog4(a,b,c,d) #define debug_elog5(a,b,c,d,e) +#define debug_elog7(a,b,c,d,e,f,g) #endif /* internal MultiXactId management */ -static void MultiXactIdSetOldestVisible(void); -static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids); +static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, - int nxids, TransactionId *xids); -static MultiXactId GetNewMultiXactId(int nxids, 
MultiXactOffset *offset); + int nmembers, MultiXactMember *members); +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); +static MultiXactId HandleMxactOffsetCornerCases(MultiXactId multi); /* MultiXact cache management */ -static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids); -static int mXactCacheGetById(MultiXactId multi, TransactionId **xids); -static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids); +static int mxactMemberComparator(const void *arg1, const void *arg2); +static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); +static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); +static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members); #ifdef MULTIXACT_DEBUG -static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids); +static char *mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members); #endif /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); +static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog, + TransactionId truncateXid, uint32 truncateXidEpoch); static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int page1, int page2); static bool MultiXactMemberPagePrecedes(int page1, int page2); @@ -235,29 +297,59 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static void TruncateMultiXact(void); -static void WriteMZeroPageXlogRec(int pageno, uint8 info); +static void fillSegmentInfoData(SlruCtl ctl, SegmentInfo *segment); +static int compareTruncateXidEpoch(const void *a, const void *b); +static void WriteMZeroOffsetPageXlogRec(int pageno, TransactionId truncateXid, + uint32 truncateXidEpoch); +static void 
WriteMZeroMemberPageXlogRec(int pageno); /* + * MultiXactIdCreateSingleton + * Construct a MultiXactId representing a single transaction. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + */ +MultiXactId +MultiXactIdCreateSingleton(TransactionId xid, MultiXactStatus status) +{ + MultiXactId newMulti; + MultiXactMember member[1]; + + AssertArg(TransactionIdIsValid(xid)); + + member[0].xid = xid; + member[0].status = status; + + newMulti = CreateMultiXactId(1, member); + + debug_elog4(DEBUG2, "Create: returning %u for %u", + newMulti, xid); + + return newMulti; +} + +/* * MultiXactIdCreate * Construct a MultiXactId representing two TransactionIds. * - * The two XIDs must be different. + * The two XIDs must be different, or be requesting different lock modes. * * NB - we don't worry about our local MultiXactId cache here, because that * is handled by the lower-level routines. */ MultiXactId -MultiXactIdCreate(TransactionId xid1, TransactionId xid2) +MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, + TransactionId xid2, MultiXactStatus status2) { MultiXactId newMulti; - TransactionId xids[2]; + MultiXactMember members[2]; AssertArg(TransactionIdIsValid(xid1)); AssertArg(TransactionIdIsValid(xid2)); - Assert(!TransactionIdEquals(xid1, xid2)); + Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); /* * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs @@ -265,11 +357,14 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2) * caller just did a check on xid1, so it'd be wasted effort. */ - xids[0] = xid1; - xids[1] = xid2; + members[0].xid = xid1; + members[0].status = status1; + members[1].xid = xid2; + members[1].status = status2; - newMulti = CreateMultiXactId(2, xids); + newMulti = CreateMultiXactId(2, members); + /* XXX -- need better debug? 
*/ debug_elog5(DEBUG2, "Create: returning %u for %u, %u", newMulti, xid1, xid2); @@ -280,8 +375,8 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2) * MultiXactIdExpand * Add a TransactionId to a pre-existing MultiXactId. * - * If the TransactionId is already a member of the passed MultiXactId, - * just return it as-is. + * If the TransactionId is already a member of the passed MultiXactId with the + * same status, just return it as-is. * * Note that we do NOT actually modify the membership of a pre-existing * MultiXactId; instead we create a new one. This is necessary to avoid @@ -291,11 +386,11 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2) * is handled by the lower-level routines. */ MultiXactId -MultiXactIdExpand(MultiXactId multi, TransactionId xid) +MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) { MultiXactId newMulti; - TransactionId *members; - TransactionId *newMembers; + MultiXactMember *members; + MultiXactMember *newMembers; int nmembers; int i; int j; @@ -310,6 +405,8 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) if (nmembers < 0) { + MultiXactMember member; + /* * The MultiXactId is obsolete. This can only happen if all the * MultiXactId members stop running between the caller checking and @@ -317,7 +414,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) * caller, but it would complicate the API and it's unlikely to happen * too often, so just deal with it by creating a singleton MultiXact. */ - newMulti = CreateMultiXactId(1, &xid); + member.xid = xid; + member.status = status; + newMulti = CreateMultiXactId(1, &member); debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", multi, newMulti); @@ -325,12 +424,13 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) } /* - * If the TransactionId is already a member of the MultiXactId, just - * return the existing MultiXactId. 
+ * If the TransactionId is already a member of the MultiXactId with the + * same status, just return the existing MultiXactId. */ for (i = 0; i < nmembers; i++) { - if (TransactionIdEquals(members[i], xid)) + if (TransactionIdEquals(members[i].xid, xid) && + (members[i].status == status)) { debug_elog4(DEBUG2, "Expand: %u is already a member of %u", xid, multi); @@ -345,16 +445,20 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) * optimization, but a useful one. Note we have the same race condition * here as above: j could be 0 at the end of the loop.) */ - newMembers = (TransactionId *) - palloc(sizeof(TransactionId) * (nmembers + 1)); + newMembers = (MultiXactMember *) + palloc(sizeof(MultiXactMember) * (nmembers + 1)); for (i = 0, j = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i])) - newMembers[j++] = members[i]; + if (TransactionIdIsInProgress(members[i].xid)) + { + newMembers[j].xid = members[i].xid; + newMembers[j++].status = members[i].status; + } } - newMembers[j++] = xid; + newMembers[j].xid = xid; + newMembers[j++].status = status; newMulti = CreateMultiXactId(j, newMembers); pfree(members); @@ -376,7 +480,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid) bool MultiXactIdIsRunning(MultiXactId multi) { - TransactionId *members; + MultiXactMember *members; int nmembers; int i; @@ -397,7 +501,7 @@ MultiXactIdIsRunning(MultiXactId multi) */ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsCurrentTransactionId(members[i])) + if (TransactionIdIsCurrentTransactionId(members[i].xid)) { debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); pfree(members); @@ -412,10 +516,10 @@ MultiXactIdIsRunning(MultiXactId multi) */ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i])) + if (TransactionIdIsInProgress(members[i].xid)) { debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", - i, members[i]); + i, members[i].xid); pfree(members); return true; } @@ -429,145 +533,6 @@ 
MultiXactIdIsRunning(MultiXactId multi) } /* - * MultiXactIdIsCurrent - * Returns true if the current transaction is a member of the MultiXactId. - * - * We return true if any live subtransaction of the current top-level - * transaction is a member. This is appropriate for the same reason that a - * lock held by any such subtransaction is globally equivalent to a lock - * held by the current subtransaction: no such lock could be released without - * aborting this subtransaction, and hence releasing its locks. So it's not - * necessary to add the current subxact to the MultiXact separately. - */ -bool -MultiXactIdIsCurrent(MultiXactId multi) -{ - bool result = false; - TransactionId *members; - int nmembers; - int i; - - nmembers = GetMultiXactIdMembers(multi, &members); - - if (nmembers < 0) - return false; - - for (i = 0; i < nmembers; i++) - { - if (TransactionIdIsCurrentTransactionId(members[i])) - { - result = true; - break; - } - } - - pfree(members); - - return result; -} - -/* - * MultiXactIdSetOldestMember - * Save the oldest MultiXactId this transaction could be a member of. - * - * We set the OldestMemberMXactId for a given transaction the first time - * it's going to acquire a shared lock. We need to do this even if we end - * up using a TransactionId instead of a MultiXactId, because there is a - * chance that another transaction would add our XID to a MultiXactId. - * - * The value to set is the next-to-be-assigned MultiXactId, so this is meant - * to be called just before acquiring a shared lock. - */ -void -MultiXactIdSetOldestMember(void) -{ - if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) - { - MultiXactId nextMXact; - - /* - * You might think we don't need to acquire a lock here, since - * fetching and storing of TransactionIds is probably atomic, but in - * fact we do: suppose we pick up nextMXact and then lose the CPU for - * a long time. 
Someone else could advance nextMXact, and then - * another someone else could compute an OldestVisibleMXactId that - * would be after the value we are going to store when we get control - * back. Which would be wrong. - */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - - /* - * We have to beware of the possibility that nextMXact is in the - * wrapped-around state. We don't fix the counter itself here, but we - * must be sure to store a valid value in our array entry. - */ - nextMXact = MultiXactState->nextMXact; - if (nextMXact < FirstMultiXactId) - nextMXact = FirstMultiXactId; - - OldestMemberMXactId[MyBackendId] = nextMXact; - - LWLockRelease(MultiXactGenLock); - - debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u", - MyBackendId, nextMXact); - } -} - -/* - * MultiXactIdSetOldestVisible - * Save the oldest MultiXactId this transaction considers possibly live. - * - * We set the OldestVisibleMXactId for a given transaction the first time - * it's going to inspect any MultiXactId. Once we have set this, we are - * guaranteed that the checkpointer won't truncate off SLRU data for - * MultiXactIds at or after our OldestVisibleMXactId. - * - * The value to set is the oldest of nextMXact and all the valid per-backend - * OldestMemberMXactId[] entries. Because of the locking we do, we can be - * certain that no subsequent call to MultiXactIdSetOldestMember can set - * an OldestMemberMXactId[] entry older than what we compute here. Therefore - * there is no live transaction, now or later, that can be a member of any - * MultiXactId older than the OldestVisibleMXactId we compute here. - */ -static void -MultiXactIdSetOldestVisible(void) -{ - if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) - { - MultiXactId oldestMXact; - int i; - - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - - /* - * We have to beware of the possibility that nextMXact is in the - * wrapped-around state. 
We don't fix the counter itself here, but we - * must be sure to store a valid value in our array entry. - */ - oldestMXact = MultiXactState->nextMXact; - if (oldestMXact < FirstMultiXactId) - oldestMXact = FirstMultiXactId; - - for (i = 1; i <= MaxOldestSlot; i++) - { - MultiXactId thisoldest = OldestMemberMXactId[i]; - - if (MultiXactIdIsValid(thisoldest) && - MultiXactIdPrecedes(thisoldest, oldestMXact)) - oldestMXact = thisoldest; - } - - OldestVisibleMXactId[MyBackendId] = oldestMXact; - - LWLockRelease(MultiXactGenLock); - - debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u", - MyBackendId, oldestMXact); - } -} - -/* * MultiXactIdWait * Sleep on a MultiXactId. * @@ -576,17 +541,24 @@ MultiXactIdSetOldestVisible(void) * this would not merely be useless but would lead to Assert failure inside * XactLockTableWait. By the time this returns, it is certain that all * transactions *of other backends* that were members of the MultiXactId - * are dead (and no new ones can have been added, since it is not legal - * to add members to an existing MultiXactId). + * that conflict with the requested status are dead (and no new ones can have + * been added, since it is not legal to add members to an existing + * MultiXactId). + * + * We return the number of members that we did not test for. This is dubbed + * "remaining" as in "the number of members that remain running", but this is + * slightly incorrect, because lockers whose status did not conflict with ours + * are not even considered and so might have gone away anyway. * * But by the time we finish sleeping, someone else may have changed the Xmax * of the containing tuple, so the caller needs to iterate on us somehow.
*/ void -MultiXactIdWait(MultiXactId multi) +MultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining) { - TransactionId *members; + MultiXactMember *members; int nmembers; + int remain = 0; nmembers = GetMultiXactIdMembers(multi, &members); @@ -596,28 +568,37 @@ MultiXactIdWait(MultiXactId multi) for (i = 0; i < nmembers; i++) { - TransactionId member = members[i]; - debug_elog4(DEBUG2, "MultiXactIdWait: waiting for %d (%u)", - i, member); - if (!TransactionIdIsCurrentTransactionId(member)) - XactLockTableWait(member); - } + i, members[i].xid); + if (TransactionIdIsCurrentTransactionId(members[i].xid) || + !MultiXactStatusConflict(members[i].status, status)) + { + remain++; + continue; + } - pfree(members); + XactLockTableWait(members[i].xid); + } } + + *remaining = remain; } /* * ConditionalMultiXactIdWait * As above, but only lock if we can get the lock without blocking. + * + * Note that in case we return false, the number of remaining members is + * not to be trusted. 
*/ bool -ConditionalMultiXactIdWait(MultiXactId multi) +ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, + int *remaining) { bool result = true; - TransactionId *members; + MultiXactMember *members; int nmembers; + int remain = 0; nmembers = GetMultiXactIdMembers(multi, &members); @@ -627,21 +608,26 @@ ConditionalMultiXactIdWait(MultiXactId multi) for (i = 0; i < nmembers; i++) { - TransactionId member = members[i]; + TransactionId member = members[i].xid; debug_elog4(DEBUG2, "ConditionalMultiXactIdWait: trying %d (%u)", i, member); - if (!TransactionIdIsCurrentTransactionId(member)) + if (TransactionIdIsCurrentTransactionId(member) || + !MultiXactStatusConflict(members[i].status, status)) { - result = ConditionalXactLockTableWait(member); - if (!result) - break; + remain++; + continue; } + result = ConditionalXactLockTableWait(member); + if (!result) + break; } pfree(members); } + *remaining = remain; + return result; } @@ -652,10 +638,10 @@ ConditionalMultiXactIdWait(MultiXactId multi) * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the * given TransactionIds as members. Returns the newly created MultiXactId. * - * NB: the passed xids[] array will be sorted in-place. + * NB: the passed members[] array will be sorted in-place. */ static MultiXactId -CreateMultiXactId(int nxids, TransactionId *xids) +CreateMultiXactId(int nmembers, MultiXactMember *members) { MultiXactId multi; MultiXactOffset offset; @@ -663,7 +649,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) xl_multixact_create xlrec; debug_elog3(DEBUG2, "Create: %s", - mxid_to_string(InvalidMultiXactId, nxids, xids)); + mxid_to_string(InvalidMultiXactId, nmembers, members)); /* * See if the same set of XIDs already exists in our cache; if so, just @@ -675,7 +661,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) * corner cases where someone else added us to a MultiXact without our * knowledge, but it's not worth checking for.) 
*/ - multi = mXactCacheGetBySet(nxids, xids); + multi = mXactCacheGetBySet(nmembers, members); if (MultiXactIdIsValid(multi)) { debug_elog2(DEBUG2, "Create: in cache!"); @@ -687,7 +673,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) * in the OFFSETs and MEMBERs files. NB: this routine does * START_CRIT_SECTION(). */ - multi = GetNewMultiXactId(nxids, &offset); + multi = GetNewMultiXactId(nmembers, &offset); /* * Make an XLOG entry describing the new MXID. @@ -704,27 +690,32 @@ CreateMultiXactId(int nxids, TransactionId *xids) */ xlrec.mid = multi; xlrec.moff = offset; - xlrec.nxids = nxids; + xlrec.nmembers = nmembers; + /* + * XXX Note: there's a lot of padding space in MultiXactMember. We could + * find a more compact representation of this Xlog record -- perhaps all the + * status flags in one XLogRecData, then all the xids in another one? + */ rdata[0].data = (char *) (&xlrec); rdata[0].len = MinSizeOfMultiXactCreate; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) xids; - rdata[1].len = nxids * sizeof(TransactionId); + rdata[1].data = (char *) members; + rdata[1].len = nmembers * sizeof(MultiXactMember); rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata); /* Now enter the information into the OFFSETs and MEMBERs logs */ - RecordNewMultiXact(multi, offset, nxids, xids); + RecordNewMultiXact(multi, offset, nmembers, members); /* Done with critical section */ END_CRIT_SECTION(); /* Store the new MultiXactId in the local cache, too */ - mXactCachePut(multi, nxids, xids); + mXactCachePut(multi, nmembers, members); debug_elog2(DEBUG2, "Create: all done"); @@ -739,7 +730,7 @@ CreateMultiXactId(int nxids, TransactionId *xids) */ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, - int nxids, TransactionId *xids) + int nmembers, MultiXactMember *members) { int pageno; int prev_pageno; @@ -775,12 +766,22 @@ 
RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, prev_pageno = -1; - for (i = 0; i < nxids; i++, offset++) + for (i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + /* this status value is not representable on disk */ + Assert(members[i].status < MultiXactStatusKeyUpdate); pageno = MXOffsetToMemberPage(offset); - entryno = MXOffsetToMemberEntry(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); if (pageno != prev_pageno) { @@ -789,10 +790,17 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, } memberptr = (TransactionId *) - MultiXactMemberCtl->shared->page_buffer[slotno]; - memberptr += entryno; + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - *memberptr = xids[i]; + *memberptr = members[i].xid; + + flagsptr = (uint32 *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; MultiXactMemberCtl->shared->page_dirty[slotno] = true; } @@ -816,21 +824,18 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, * caller must end the critical section after writing SLRU data. 
*/ static MultiXactId -GetNewMultiXactId(int nxids, MultiXactOffset *offset) +GetNewMultiXactId(int nmembers, MultiXactOffset *offset) { MultiXactId result; MultiXactOffset nextOffset; - debug_elog3(DEBUG2, "GetNew: for %d xids", nxids); - - /* MultiXactIdSetOldestMember() must have been called already */ - Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - /* Handle wraparound of the nextMXact counter */ - if (MultiXactState->nextMXact < FirstMultiXactId) - MultiXactState->nextMXact = FirstMultiXactId; + /* Handle corner cases of the nextMXact counter */ + MultiXactState->nextMXact = + HandleMxactOffsetCornerCases(MultiXactState->nextMXact); /* * Assign the MXID, and make sure there is room for it in the file. @@ -848,12 +853,12 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) if (nextOffset == 0) { *offset = 1; - nxids++; /* allocate member slot 0 too */ + nmembers++; /* allocate member slot 0 too */ } else *offset = nextOffset; - ExtendMultiXactMember(nextOffset, nxids); + ExtendMultiXactMember(nextOffset, nmembers); /* * Critical section from here until caller has written the data into the @@ -870,13 +875,14 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) * * We don't care about MultiXactId wraparound here; it will be handled by * the next iteration. But note that nextMXact may be InvalidMultiXactId - * after this routine exits, so anyone else looking at the variable must - * be prepared to deal with that. Similarly, nextOffset may be zero, but - * we won't use that as the actual start offset of the next multixact. + * or the first value on a segment-beginning page after this routine exits, + * so anyone else looking at the variable must be prepared to deal with + * either case. Similarly, nextOffset may be zero, but we won't use that + * as the actual start offset of the next multixact.
*/ (MultiXactState->nextMXact)++; - MultiXactState->nextOffset += nxids; + MultiXactState->nextOffset += nmembers; LWLockRelease(MultiXactGenLock); @@ -885,15 +891,37 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) } /* + * HandleMxactOffsetCornerCases + * Properly handle corner cases of MultiXactId enumeration + * + * This function takes a MultiXactId and returns a value that's actually a + * valid multi, that is, it skips the first two values of any segment- + * beginning page, which are used to store the truncateXid and + * truncateXidEpoch. + */ +static MultiXactId +HandleMxactOffsetCornerCases(MultiXactId multi) +{ + if (multi < FirstMultiXactId) + return FirstMultiXactId; + + if (MultiXactIdToOffsetEntry(multi) == 0 && + multi % SLRU_PAGES_PER_SEGMENT == 0) + return multi + 2; + + return multi; +} + +/* * GetMultiXactIdMembers - * Returns the set of TransactionIds that make up a MultiXactId + * Returns the set of MultiXactMembers that make up a MultiXactId * * We return -1 if the MultiXactId is too old to possibly have any members * still running; in that case we have not actually looked them up, and - * *xids is not set. + * *members is not set. 
*/ int -GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) +GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members) { int pageno; int prev_pageno; @@ -904,64 +932,61 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) int length; int truelength; int i; + MultiXactId oldestMXact; MultiXactId nextMXact; MultiXactId tmpMXact; MultiXactOffset nextOffset; - TransactionId *ptr; + MultiXactMember *ptr; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); Assert(MultiXactIdIsValid(multi)); /* See if the MultiXactId is in the local cache */ - length = mXactCacheGetById(multi, xids); + length = mXactCacheGetById(multi, members); if (length >= 0) { debug_elog3(DEBUG2, "GetMembers: found %s in the cache", - mxid_to_string(multi, length, *xids)); + mxid_to_string(multi, length, *members)); return length; } - /* Set our OldestVisibleMXactId[] entry if we didn't already */ - MultiXactIdSetOldestVisible(); - /* * We check known limits on MultiXact before resorting to the SLRU area. * - * An ID older than our OldestVisibleMXactId[] entry can't possibly still - * be running, and we'd run the risk of trying to read already-truncated - * SLRU data if we did try to examine it. + * An ID older than MultiXactState->oldestMultiXactId cannot possibly be + * useful; it should have already been frozen by vacuum. We've truncated + * the on-disk structures anyway, so we return empty if such a value is + * queried. * * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is * seen, it implies undetected ID wraparound has occurred. We just * silently assume that such an ID is no longer running. * * Shared lock is enough here since we aren't modifying any global state. - * Also, we can examine our own OldestVisibleMXactId without the lock, - * since no one else is allowed to change it. 
- */ - if (MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) - { - debug_elog2(DEBUG2, "GetMembers: it's too old"); - *xids = NULL; - return -1; - } - - /* + * * Acquire the shared lock just long enough to grab the current counter * values. We may need both nextMXact and nextOffset; see below. */ LWLockAcquire(MultiXactGenLock, LW_SHARED); + oldestMXact = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; LWLockRelease(MultiXactGenLock); + if (MultiXactIdPrecedes(multi, oldestMXact)) + { + debug_elog2(DEBUG2, "GetMembers: it's too old"); + *members = NULL; + return -1; + } + if (!MultiXactIdPrecedes(multi, nextMXact)) { debug_elog2(DEBUG2, "GetMembers: it's too new!"); - *xids = NULL; + *members = NULL; return -1; } @@ -1026,9 +1051,8 @@ retry: { MultiXactOffset nextMXOffset; - /* handle wraparound if needed */ - if (tmpMXact < FirstMultiXactId) - tmpMXact = FirstMultiXactId; + /* Handle corner cases if needed */ + tmpMXact = HandleMxactOffsetCornerCases(tmpMXact); prev_pageno = pageno; @@ -1055,8 +1079,8 @@ retry: LWLockRelease(MultiXactOffsetControlLock); - ptr = (TransactionId *) palloc(length * sizeof(TransactionId)); - *xids = ptr; + ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); + *members = ptr; /* Now get the members themselves. 
*/ LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); @@ -1066,9 +1090,13 @@ retry: for (i = 0; i < length; i++, offset++) { TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; pageno = MXOffsetToMemberPage(offset); - entryno = MXOffsetToMemberEntry(offset); + memberoff = MXOffsetToMemberOffset(offset); if (pageno != prev_pageno) { @@ -1077,8 +1105,7 @@ retry: } xactptr = (TransactionId *) - MultiXactMemberCtl->shared->page_buffer[slotno]; - xactptr += entryno; + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); if (!TransactionIdIsValid(*xactptr)) { @@ -1087,7 +1114,13 @@ retry: continue; } - ptr[truelength++] = *xactptr; + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + ptr[truelength].xid = *xactptr; + ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + truelength++; } LWLockRelease(MultiXactMemberControlLock); @@ -1103,6 +1136,30 @@ retry: } /* + * mxactMemberComparator + * qsort comparison function for MultiXactMember + * + * We can't use wraparound comparison for XIDs because that does not respect + * the triangle inequality! Any old sort order will do. + */ +static int +mxactMemberComparator(const void *arg1, const void *arg2) +{ + MultiXactMember member1 = *(const MultiXactMember *) arg1; + MultiXactMember member2 = *(const MultiXactMember *) arg2; + + if (member1.xid > member2.xid) + return 1; + if (member1.xid < member2.xid) + return -1; + if (member1.status > member2.status) + return 1; + if (member1.status < member2.status) + return -1; + return 0; +} + +/* * mXactCacheGetBySet * returns a MultiXactId from the cache based on the set of * TransactionIds that compose it, or InvalidMultiXactId if @@ -1113,26 +1170,27 @@ retry: * for the majority of tuples, thus keeping MultiXactId usage low (saving * both I/O and wraparound issues). 
* - * NB: the passed xids[] array will be sorted in-place. + * NB: the passed members array will be sorted in-place. */ static MultiXactId -mXactCacheGetBySet(int nxids, TransactionId *xids) +mXactCacheGetBySet(int nmembers, MultiXactMember *members) { mXactCacheEnt *entry; debug_elog3(DEBUG2, "CacheGet: looking for %s", - mxid_to_string(InvalidMultiXactId, nxids, xids)); + mxid_to_string(InvalidMultiXactId, nmembers, members)); /* sort the array so comparison is easy */ - qsort(xids, nxids, sizeof(TransactionId), xidComparator); + qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); for (entry = MXactCache; entry != NULL; entry = entry->next) { - if (entry->nxids != nxids) + if (entry->nmembers != nmembers) continue; /* We assume the cache entries are sorted */ - if (memcmp(xids, entry->xids, nxids * sizeof(TransactionId)) == 0) + /* XXX we assume the unused bits in "status" are zeroed */ + if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) { debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); return entry->multi; @@ -1145,14 +1203,14 @@ mXactCacheGetBySet(int nxids, TransactionId *xids) /* * mXactCacheGetById - * returns the composing TransactionId set from the cache for a + * returns the composing MultiXactMember set from the cache for a * given MultiXactId, if present. * * If successful, *xids is set to the address of a palloc'd copy of the - * TransactionId set. Return value is number of members, or -1 on failure. + * MultiXactMember set. Return value is number of members, or -1 on failure. 
*/ static int -mXactCacheGetById(MultiXactId multi, TransactionId **xids) +mXactCacheGetById(MultiXactId multi, MultiXactMember **members) { mXactCacheEnt *entry; @@ -1162,18 +1220,18 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids) { if (entry->multi == multi) { - TransactionId *ptr; + MultiXactMember *ptr; Size size; - size = sizeof(TransactionId) * entry->nxids; - ptr = (TransactionId *) palloc(size); - *xids = ptr; + size = sizeof(MultiXactMember) * entry->nmembers; + ptr = (MultiXactMember *) palloc(size); + *members = ptr; - memcpy(ptr, entry->xids, size); + memcpy(ptr, entry->members, size); debug_elog3(DEBUG2, "CacheGet: found %s", - mxid_to_string(multi, entry->nxids, entry->xids)); - return entry->nxids; + mxid_to_string(multi, entry->nmembers, entry->members)); + return entry->nmembers; } } @@ -1186,12 +1244,12 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids) * Add a new MultiXactId and its composing set into the local cache. */ static void -mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) +mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) { mXactCacheEnt *entry; debug_elog3(DEBUG2, "CachePut: storing %s", - mxid_to_string(multi, nxids, xids)); + mxid_to_string(multi, nmembers, members)); if (MXactContext == NULL) { @@ -1206,15 +1264,15 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) entry = (mXactCacheEnt *) MemoryContextAlloc(MXactContext, - offsetof(mXactCacheEnt, xids) + - nxids * sizeof(TransactionId)); + offsetof(mXactCacheEnt, members) + + nmembers * sizeof(MultiXactMember)); entry->multi = multi; - entry->nxids = nxids; - memcpy(entry->xids, xids, nxids * sizeof(TransactionId)); + entry->nmembers = nmembers; + memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ - qsort(entry->xids, nxids, sizeof(TransactionId), xidComparator); + qsort(entry->members, nmembers, 
sizeof(MultiXactMember), mxactMemberComparator); entry->next = MXactCache; MXactCache = entry; @@ -1222,15 +1280,38 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) #ifdef MULTIXACT_DEBUG static char * -mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids) +mxstatus_to_string(MultiXactStatus status) { - char *str = palloc(15 * (nxids + 1) + 4); + switch (status) + { + case MultiXactStatusForKeyShare: + return "keysh"; + case MultiXactStatusForShare: + return "sh"; + case MultiXactStatusForUpdate: + return "forupd"; + case MultiXactStatusUpdate: + return "upd"; + case MultiXactStatusKeyUpdate: + return "keyup"; + default: + elog(ERROR, "unrecognized multixact status %d", status); + return ""; + } +} + +static char * +mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + char *str = palloc(15 * (nmembers + 1) + 4); int i; - snprintf(str, 47, "%u %d[%u", multi, nxids, xids[0]); + snprintf(str, 47, "%u %d[%u (%s)", multi, nmembers, members[0].xid, + mxstatus_to_string(members[0].status)); - for (i = 1; i < nxids; i++) - snprintf(str + strlen(str), 17, ", %u", xids[i]); + for (i = 1; i < nmembers; i++) + snprintf(str + strlen(str), 17, ", %u (%s)", members[i].xid, + mxstatus_to_string(members[i].status)); strcat(str, "]"); return str; @@ -1247,16 +1328,6 @@ void AtEOXact_MultiXact(void) { /* - * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of - * which should only be valid while within a transaction. - * - * We assume that storing a MultiXactId is atomic and so we need not take - * MultiXactGenLock to do this. - */ - OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; - OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; - - /* * Discard the local MultiXactId cache. Since MXactContext was created as * a child of TopTransactionContext, we needn't delete it explicitly. 
*/ @@ -1267,18 +1338,11 @@ AtEOXact_MultiXact(void) /* * AtPrepare_MultiXact * Save multixact state at 2PC tranasction prepare - * - * In this phase, we only store our OldestMemberMXactId value in the two-phase - * state file. */ void AtPrepare_MultiXact(void) { - MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId]; - - if (MultiXactIdIsValid(myOldestMember)) - RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, - &myOldestMember, sizeof(MultiXactId)); + /* nothing to do */ } /* @@ -1288,41 +1352,6 @@ AtPrepare_MultiXact(void) void PostPrepare_MultiXact(TransactionId xid) { - MultiXactId myOldestMember; - - /* - * Transfer our OldestMemberMXactId value to the slot reserved for the - * prepared transaction. - */ - myOldestMember = OldestMemberMXactId[MyBackendId]; - if (MultiXactIdIsValid(myOldestMember)) - { - BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid); - - /* - * Even though storing MultiXactId is atomic, acquire lock to make - * sure others see both changes, not just the reset of the slot of the - * current backend. Using a volatile pointer might suffice, but this - * isn't a hot spot. - */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - - OldestMemberMXactId[dummyBackendId] = myOldestMember; - OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; - - LWLockRelease(MultiXactGenLock); - } - - /* - * We don't need to transfer OldestVisibleMXactId value, because the - * transaction is not going to be looking at any more multixacts once it's - * prepared. - * - * We assume that storing a MultiXactId is atomic and so we need not take - * MultiXactGenLock to do this. 
- */ - OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; - /* * Discard the local MultiXactId cache like in AtEOX_MultiXact */ @@ -1338,17 +1367,7 @@ void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len) { - BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid); - MultiXactId oldestMember; - - /* - * Get the oldest member XID from the state file record, and set it in the - * OldestMemberMXactId slot reserved for this prepared transaction. - */ - Assert(len == sizeof(MultiXactId)); - oldestMember = *((MultiXactId *) recdata); - - OldestMemberMXactId[dummyBackendId] = oldestMember; + /* nothing to do */ } /* @@ -1359,11 +1378,7 @@ void multixact_twophase_postcommit(TransactionId xid, uint16 info, void *recdata, uint32 len) { - BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid); - - Assert(len == sizeof(MultiXactId)); - - OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId; + /* nothing to do */ } /* @@ -1374,7 +1389,7 @@ void multixact_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len) { - multixact_twophase_postcommit(xid, info, recdata, len); + /* nothing to do */ } /* @@ -1387,11 +1402,7 @@ MultiXactShmemSize(void) { Size size; -#define SHARED_MULTIXACT_STATE_SIZE \ - add_size(sizeof(MultiXactStateData), \ - mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) - - size = SHARED_MULTIXACT_STATE_SIZE; + size = sizeof(MultiXactStateData); size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0)); size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0)); @@ -1417,24 +1428,17 @@ MultiXactShmemInit(void) /* Initialize our shared state struct */ MultiXactState = ShmemInitStruct("Shared MultiXact State", - SHARED_MULTIXACT_STATE_SIZE, + sizeof(MultiXactStateData), &found); if (!IsUnderPostmaster) { Assert(!found); /* Make sure we zero out the per-backend state */ - MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); + MemSet(MultiXactState, 0, 
sizeof(MultiXactStateData)); } else Assert(found); - - /* - * Set up array pointers. Note that perBackendXactIds[0] is wasted space - * since we only use indexes 1..MaxOldestSlot in each array. - */ - OldestMemberMXactId = MultiXactState->perBackendXactIds; - OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; } /* @@ -1450,7 +1454,7 @@ BootStrapMultiXact(void) LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); /* Create and zero the first page of the offsets log */ - slotno = ZeroMultiXactOffsetPage(0, false); + slotno = ZeroMultiXactOffsetPage(0, false, InvalidTransactionId, 0); /* Make sure it's written out */ SimpleLruWritePage(MultiXactOffsetCtl, slotno); @@ -1474,26 +1478,40 @@ BootStrapMultiXact(void) * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. * If writeXlog is TRUE, also emit an XLOG record saying we did this. * + * If truncateXid is valid, store it in the first position of the page. + * * The page is not actually written, just set up in shared memory. * The slot number of the new page is returned. * * Control lock must be held at entry, and will be held at exit. */ static int -ZeroMultiXactOffsetPage(int pageno, bool writeXlog) +ZeroMultiXactOffsetPage(int pageno, bool writeXlog, TransactionId truncateXid, + uint32 truncateXidEpoch) { int slotno; slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + WriteMZeroOffsetPageXlogRec(pageno, truncateXid, truncateXidEpoch); + + if (TransactionIdIsValid(truncateXid)) + { + MultiXactOffset *offptr; + + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + *(offptr++) = truncateXid; + *offptr = truncateXidEpoch; + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } return slotno; } /* - * Ditto, for MultiXactMember + * Ditto for MultiXactMember, except these don't worry about truncation info. 
*/ static int ZeroMultiXactMemberPage(int pageno, bool writeXlog) @@ -1503,7 +1521,7 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog) slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); + WriteMZeroMemberPageXlogRec(pageno); return slotno; } @@ -1525,6 +1543,7 @@ StartupMultiXact(void) MultiXactOffset offset = MultiXactState->nextOffset; int pageno; int entryno; + int flagsoff; /* Clean up offsets state */ LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); @@ -1569,28 +1588,30 @@ StartupMultiXact(void) * Zero out the remainder of the current members page. See notes in * TrimCLOG() for motivation. */ - entryno = MXOffsetToMemberEntry(offset); - if (entryno != 0) + flagsoff = MXOffsetToFlagsOffset(offset); + if (flagsoff != 0) { int slotno; TransactionId *xidptr; + int memberoff; + memberoff = MXOffsetToMemberOffset(offset); slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); - xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno]; - xidptr += entryno; + xidptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - MemSet(xidptr, 0, BLCKSZ - (entryno * sizeof(TransactionId))); + MemSet(xidptr, 0, BLCKSZ - memberoff); + + /* + * Note: we don't need to zero out the flag bits in the remaining + * members of the current group, because they are always reset before + * writing. + */ MultiXactMemberCtl->shared->page_dirty[slotno] = true; } LWLockRelease(MultiXactMemberControlLock); - - /* - * Initialize lastTruncationPoint to invalid, ensuring that the first - * checkpoint will try to do truncation. 
- */ - MultiXactState->lastTruncationPoint = InvalidMultiXactId; } /* @@ -1607,22 +1628,31 @@ ShutdownMultiXact(void) } /* - * Get the next MultiXactId and offset to save in a checkpoint record + * Get the next MultiXactId, offset and truncate info to save in a checkpoint + * record */ void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, - MultiXactOffset *nextMultiOffset) + MultiXactOffset *nextMultiOffset, + TransactionId *oldestTruncateXid, + uint32 *oldestTruncateXidEpoch, + MultiXactId *oldestMulti) { LWLockAcquire(MultiXactGenLock, LW_SHARED); *nextMulti = MultiXactState->nextMXact; *nextMultiOffset = MultiXactState->nextOffset; + *oldestTruncateXid = MultiXactState->truncateXid; + *oldestTruncateXidEpoch = MultiXactState->truncateXidEpoch; + *oldestMulti = MultiXactState->oldestMultiXactId; LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "MultiXact: checkpoint is nextMulti %u, nextOffset %u", - *nextMulti, *nextMultiOffset); + debug_elog7(DEBUG2, + "MultiXact: checkpoint is nextMulti %u, nextOffset %u; truncate xid %u, epoch %u; oldest multi %u", + *nextMulti, *nextMultiOffset, *oldestTruncateXid, + *oldestTruncateXidEpoch, *oldestMulti); } /* @@ -1637,17 +1667,6 @@ CheckPointMultiXact(void) SimpleLruFlush(MultiXactOffsetCtl, true); SimpleLruFlush(MultiXactMemberCtl, true); - /* - * Truncate the SLRU files. This could be done at any time, but - * checkpoint seems a reasonable place for it. There is one exception: if - * we are called during xlog recovery, then shared->latest_page_number - * isn't valid (because StartupMultiXact hasn't been called yet) and so - * SimpleLruTruncate would get confused. It seems best not to risk - * removing any data during recovery anyway, so don't truncate. 
- */ - if (!RecoveryInProgress()) - TruncateMultiXact(); - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); } @@ -1670,7 +1689,7 @@ MultiXactSetNextMXact(MultiXactId nextMulti, /* * Ensure the next-to-be-assigned MultiXactId is at least minMulti, - * and similarly nextOffset is at least minMultiOffset + * and similarly nextOffset is at least minMultiOffset. * * This is used when we can determine minimum safe values from an XLog * record (either an on-line checkpoint or an mxact creation log entry). @@ -1696,6 +1715,9 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, /* * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. * + * If the newly allocated page is the first page on the segment, store an + * appropriate truncate Xid value in the page first position. + * * NB: this is called while holding MultiXactGenLock. We want it to be very * fast most of the time; even when it's not so fast, no actual I/O need * happen unless we're forced to write out a dirty log or xlog page to make @@ -1705,6 +1727,8 @@ static void ExtendMultiXactOffset(MultiXactId multi) { int pageno; + TransactionId truncateXid; + uint32 truncateXidEpoch; /* * No work except at first MultiXactId of a page. But beware: just after @@ -1716,12 +1740,49 @@ ExtendMultiXactOffset(MultiXactId multi) pageno = MultiXactIdToOffsetPage(multi); - LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + /* + * Determine the truncateXid and epoch that the new segment needs, if + * this is the first page of the segment. + */ + if (pageno % SLRU_PAGES_PER_SEGMENT == 0) + { + TransactionId nextXid; + + Assert(TransactionIdIsValid(RecentGlobalXmin)); + truncateXid = RecentGlobalXmin; + + GetNextXidAndEpoch(&nextXid, &truncateXidEpoch); + /* + * nextXid is certainly logically later than RecentGlobalXmin. So if + * it's numerically less, it must have wrapped into the next epoch. 
+ */ + if (nextXid < truncateXid) + truncateXidEpoch--; + } + else + { + truncateXid = InvalidTransactionId; + truncateXidEpoch = 0; + } - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactOffsetPage(pageno, true); + LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + /* + * Zero the page, mark it with its truncate info, and make an XLOG entry + * about it. + */ + ZeroMultiXactOffsetPage(pageno, true, truncateXid, truncateXidEpoch); LWLockRelease(MultiXactOffsetControlLock); + + /* + * Finally, record the new truncation point in shared memory, if + * there isn't one already. + */ + if (!TransactionIdIsValid(MultiXactState->truncateXid)) + { + MultiXactState->truncateXid = truncateXid; + MultiXactState->truncateXidEpoch = truncateXidEpoch; + } } /* @@ -1742,13 +1803,16 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) */ while (nmembers > 0) { - int entryno; + int flagsoff; + int flagsbit; + int difference; /* * Only zero when at first entry of a page. */ - entryno = MXOffsetToMemberEntry(offset); - if (entryno == 0) + flagsoff = MXOffsetToFlagsOffset(offset); + flagsbit = MXOffsetToFlagsBitShift(offset); + if (flagsoff == 0 && flagsbit == 0) { int pageno; @@ -1763,122 +1827,241 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) } /* Advance to next page (OK if nmembers goes negative) */ - offset += (MULTIXACT_MEMBERS_PER_PAGE - entryno); - nmembers -= (MULTIXACT_MEMBERS_PER_PAGE - entryno); + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + offset += difference; + nmembers -= difference; + } +} + +/* + * Complete a SegmentInfo with the truncate Xid and epoch, as read from its + * first page. 
+ */ +static void +fillSegmentInfoData(SlruCtl ctl, SegmentInfo *segment) +{ + int slotno; + MultiXactId *offptr; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + /* FIXME it'd be nice not to trash the entire SLRU cache while at this */ + slotno = SimpleLruReadPage_ReadOnly(ctl, segment->segno, InvalidTransactionId); + offptr = (MultiXactId *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + segment->truncateXid = *offptr; + offptr++; + segment->truncateXidEpoch = *offptr; + offptr++; + segment->firstOffset = *offptr; + LWLockRelease(ctl->shared->ControlLock); +} + +/* SegmentInfo comparator, for qsort and bsearch */ +static int +compareTruncateXidEpoch(const void *a, const void *b) +{ + const SegmentInfo *sega = (const SegmentInfo *) a; + const SegmentInfo *segb = (const SegmentInfo *) b; + uint32 epocha = sega->truncateXidEpoch; + uint32 epochb = segb->truncateXidEpoch; + TransactionId xida = sega->truncateXid; + TransactionId xidb = segb->truncateXid; + + if (epocha < epochb) + return -1; + if (epocha > epochb) + return 1; + if (xida < xidb) + return -1; + if (xida > xidb) + return 1; + return 0; +} + +/* + * SlruScanDirectory callback + * This callback is in charge of scanning all existing segments, + * to determine their respective truncation points. + * + * This does not delete any segments. + */ +static bool +mxactSlruGathererCb(SlruCtl ctl, char *segname, int segpage, + void *data) +{ + TruncateCbData *truncdata = (TruncateCbData *) data; + SegmentInfo seg; + + /* + * Keep track of the truncate Xid and other data for the caller to sort out + * the new truncation point. 
+ */ + seg.segno = segpage % SLRU_PAGES_PER_SEGMENT; + fillSegmentInfoData(ctl, &seg); + + if (truncdata->remaining == NULL) + { + truncdata->remaining_alloc = 8; + truncdata->remaining_used = 0; + truncdata->remaining = palloc(truncdata->remaining_alloc * + sizeof(SegmentInfo)); } + else if (truncdata->remaining_used == truncdata->remaining_alloc - 1) + { + truncdata->remaining_alloc *= 2; + truncdata->remaining = repalloc(truncdata->remaining, + truncdata->remaining_alloc); + } + truncdata->remaining[truncdata->remaining_used++] = seg; + + return false; /* keep going */ } /* * Remove all MultiXactOffset and MultiXactMember segments before the oldest * ones still of interest. * + * The truncation rules for the Offset SLRU area are: + * + * 1. the current segment is never to be deleted. + * 2. for all the remaining segments, keep track of their respective number + * and truncate Xid info. The caller is to determine the new truncation + * point from this data. + * * This is called only during checkpoints. We assume no more than one * backend does this at a time. * * XXX do we have any issues with needing to checkpoint here? */ -static void -TruncateMultiXact(void) +void +TruncateMultiXact(TransactionId frozenXid) { - MultiXactId nextMXact; - MultiXactOffset nextOffset; - MultiXactId oldestMXact; - MultiXactOffset oldestOffset; + TransactionId currentXid; + uint32 frozenXidEpoch; + TruncateCbData truncdata; + SegmentInfo *truncateSegment; + SegmentInfo frozenPosition; int cutoffPage; int i; + TransactionId newTruncateXid; + int newTruncateXidEpoch; /* - * First, compute where we can safely truncate. Per notes above, this is - * the oldest valid value among all the OldestMemberMXactId[] and - * OldestVisibleMXactId[] entries, or nextMXact if none are valid. + * Quick exit #1: if the truncateXid is not valid, bail out. We do this + * check without a lock so that it's fast in the common case when there's + * only one segment (which cannot be removed). 
If a concurrent backend is + * creating a new segment, no problem: it just means we delay removing + * files until we're next called. This assumes that storing an aligned + * 32-bit value is atomic. */ - LWLockAcquire(MultiXactGenLock, LW_SHARED); + if (!TransactionIdIsValid(MultiXactState->truncateXid)) + return; /* - * We have to beware of the possibility that nextMXact is in the - * wrapped-around state. We don't fix the counter itself here, but we - * must be sure to use a valid value in our calculation. + * Compute the epoch corresponding to the frozenXid value we were given. + * + * The current Xid value must be logically newer than frozenXid, so if it's + * numerically lower, it must belong to the next epoch. */ - nextMXact = MultiXactState->nextMXact; - if (nextMXact < FirstMultiXactId) - nextMXact = FirstMultiXactId; + GetNextXidAndEpoch(&currentXid, &frozenXidEpoch); + if (currentXid < frozenXid) + frozenXidEpoch--; - oldestMXact = nextMXact; - for (i = 1; i <= MaxOldestSlot; i++) + /* + * Quick exit #2: the oldest segment is not yet old enough to be removed. + * In that case we don't need to scan the whole directory. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + Assert(frozenXidEpoch >= MultiXactState->truncateXidEpoch); + if ((frozenXidEpoch == MultiXactState->truncateXidEpoch) && + (frozenXid < MultiXactState->truncateXid)) { - MultiXactId thisoldest; - - thisoldest = OldestMemberMXactId[i]; - if (MultiXactIdIsValid(thisoldest) && - MultiXactIdPrecedes(thisoldest, oldestMXact)) - oldestMXact = thisoldest; - thisoldest = OldestVisibleMXactId[i]; - if (MultiXactIdIsValid(thisoldest) && - MultiXactIdPrecedes(thisoldest, oldestMXact)) - oldestMXact = thisoldest; + LWLockRelease(MultiXactGenLock); + return; } - - /* Save the current nextOffset too */ - nextOffset = MultiXactState->nextOffset; - LWLockRelease(MultiXactGenLock); - debug_elog3(DEBUG2, "MultiXact: truncation point = %u", oldestMXact); - /* - * If we already truncated at this point, do nothing.
This saves time - * when no MultiXacts are getting used, which is probably not uncommon. + * Have our callback scan the SLRU directory to let us determine the + * truncation point. */ - if (MultiXactState->lastTruncationPoint == oldestMXact) - return; + truncdata.remaining_used = 0; + truncdata.remaining_alloc = 0; + truncdata.remaining = NULL; + SlruScanDirectory(MultiXactOffsetCtl, mxactSlruGathererCb, &truncdata); /* - * We need to determine where to truncate MultiXactMember. If we found a - * valid oldest MultiXactId, read its starting offset; otherwise we use - * the nextOffset value we saved above. + * Determine the maximum segment whose truncateXid is less than the + * truncate point. */ - if (oldestMXact == nextMXact) - oldestOffset = nextOffset; - else + frozenPosition.truncateXid = frozenXid; + frozenPosition.truncateXidEpoch = frozenXidEpoch; + truncateSegment = NULL; + for (i = 0; i < truncdata.remaining_used; i++) { - int pageno; - int slotno; - int entryno; - MultiXactOffset *offptr; + if ((compareTruncateXidEpoch(&frozenPosition, + &(truncdata.remaining[i])) > 0) && + (truncateSegment->segno < truncdata.remaining[i].segno)) + { + truncateSegment = &(truncdata.remaining[i]); + } + } - /* lock is acquired by SimpleLruReadPage_ReadOnly */ + /* + * Nothing to delete? This shouldn't happen, due to quick exit #2 above, + * but we'd better cope. + */ + if (truncateSegment == NULL) + return; - pageno = MultiXactIdToOffsetPage(oldestMXact); - entryno = MultiXactIdToOffsetEntry(oldestMXact); + /* truncate MultiXactOffset */ + SimpleLruTruncate(MultiXactOffsetCtl, firstPageOf(truncateSegment->segno)); - slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, oldestMXact); - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; - offptr += entryno; - oldestOffset = *offptr; + /* + * And truncate MultiXactMember at the first offset used by the oldest + * remaining segment. 
+ */ + cutoffPage = MXOffsetToMemberPage(truncateSegment->firstOffset); - LWLockRelease(MultiXactOffsetControlLock); - } + SimpleLruTruncate(MultiXactMemberCtl, cutoffPage); /* - * The cutoff point is the start of the segment containing oldestMXact. We - * pass the *page* containing oldestMXact to SimpleLruTruncate. + * Finally, update shared memory to keep track of the next usable + * truncation point, if any. If the truncation point for offsets was the + * last remaining segment, then there's no next truncation point: it will + * be set when the next segment is created. Otherwise, the second + * remaining segment determines the next truncation point. */ - cutoffPage = MultiXactIdToOffsetPage(oldestMXact); + newTruncateXid = InvalidTransactionId; + newTruncateXidEpoch = 0; + for (i = 0; i < truncdata.remaining_used; i++) + { + if (truncdata.remaining[i].segno == truncateSegment->segno + 1) + { + newTruncateXid = truncdata.remaining[i].truncateXid; + newTruncateXidEpoch = truncdata.remaining[i].truncateXidEpoch; + break; + } + } - SimpleLruTruncate(MultiXactOffsetCtl, cutoffPage); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); /* - * Also truncate MultiXactMember at the previously determined offset. + * FIXME there's a race condition here: somebody might have created a new + * segment after we finished scanning the dir. That scenario would leave + * us with an invalid truncateXid in shared memory, which is not an easy + * situation to get out of. Needs more thought. */ - cutoffPage = MXOffsetToMemberPage(oldestOffset); - SimpleLruTruncate(MultiXactMemberCtl, cutoffPage); + MultiXactState->truncateXid = newTruncateXid; + MultiXactState->truncateXidEpoch = newTruncateXidEpoch; /* - * Set the last known truncation point. We don't need a lock for this - * since only one backend does checkpoints at a time. 
+ * we also set the oldest visible MultiXactId to the frozenXid value we + * were given; although the segments we kept may have values earlier than + * that, they are not supposed to remain on disk anyway. */ - MultiXactState->lastTruncationPoint = oldestMXact; + MultiXactState->oldestMultiXactId = frozenXid; + LWLockRelease(MultiXactGenLock); } /* @@ -1947,13 +2130,29 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) return (diff < 0); } +static void +WriteMZeroOffsetPageXlogRec(int pageno, TransactionId truncateXid, + uint32 truncateXidEpoch) +{ + XLogRecData rdata; + MxactZeroOffPg zerooff; + + zerooff.pageno = pageno; + zerooff.truncateXid = truncateXid; + zerooff.truncateXidEpoch = truncateXidEpoch; + + rdata.data = (char *) (&zerooff); + rdata.len = sizeof(MxactZeroOffPg); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE, &rdata); +} /* - * Write an xlog record reflecting the zeroing of either a MEMBERs or - * OFFSETs page (info shows which) + * Write an xlog record reflecting the zeroing of a MEMBERs page.
*/ static void -WriteMZeroPageXlogRec(int pageno, uint8 info) +WriteMZeroMemberPageXlogRec(int pageno) { XLogRecData rdata; @@ -1961,7 +2160,7 @@ WriteMZeroPageXlogRec(int pageno, uint8 info) rdata.len = sizeof(int); rdata.buffer = InvalidBuffer; rdata.next = NULL; - (void) XLogInsert(RM_MULTIXACT_ID, info, &rdata); + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_MEM_PAGE, &rdata); } /* @@ -1977,18 +2176,25 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { - int pageno; + MxactZeroOffPg zerooff; int slotno; - memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + memcpy(&zerooff, XLogRecGetData(record), sizeof(MxactZeroOffPg)); LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); - slotno = ZeroMultiXactOffsetPage(pageno, false); + slotno = ZeroMultiXactOffsetPage(zerooff.pageno, false, + zerooff.truncateXid, + zerooff.truncateXidEpoch); SimpleLruWritePage(MultiXactOffsetCtl, slotno); Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); LWLockRelease(MultiXactOffsetControlLock); + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + if (!TransactionIdIsValid(MultiXactState->truncateXid)) + MultiXactState->truncateXid = zerooff.truncateXid; + LWLockRelease(MultiXactGenLock); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { @@ -2008,15 +2214,18 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) else if (info == XLOG_MULTIXACT_CREATE_ID) { xl_multixact_create *xlrec = (xl_multixact_create *) XLogRecGetData(record); - TransactionId *xids = xlrec->xids; + MultiXactMember *members = xlrec->members; TransactionId max_xid; int i; /* Store the data back into the SLRU files */ - RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids); + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, members); - /* Make sure nextMXact/nextOffset are beyond what this record has */ - MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids); + /* + * Make sure nextMXact/nextOffset are beyond 
what this record has. + * We cannot compute a truncateXid from this. + */ + MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nmembers); /* * Make sure nextXid is beyond any XID mentioned in the record. This @@ -2024,10 +2233,10 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) * evidence in the XLOG, but let's be safe. */ max_xid = record->xl_xid; - for (i = 0; i < xlrec->nxids; i++) + for (i = 0; i < xlrec->nmembers; i++) { - if (TransactionIdPrecedes(max_xid, xids[i])) - max_xid = xids[i]; + if (TransactionIdPrecedes(max_xid, members[i].xid)) + max_xid = members[i].xid; } /* @@ -2055,10 +2264,13 @@ multixact_desc(StringInfo buf, uint8 xl_info, char *rec) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { - int pageno; + MxactZeroOffPg zerooff; - memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "zero offsets page: %d", pageno); + memcpy(&zerooff, XLogRecGetData(rec), sizeof(MxactZeroOffPg)); + appendStringInfo(buf, "zero offsets page: %d truncate: %u/%u", + zerooff.pageno, + zerooff.truncateXidEpoch, + zerooff.truncateXid); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { @@ -2072,10 +2284,11 @@ multixact_desc(StringInfo buf, uint8 xl_info, char *rec) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; + /* XXX describe status too? 
*/ appendStringInfo(buf, "create multixact %u offset %u:", xlrec->mid, xlrec->moff); - for (i = 0; i < xlrec->nxids; i++) - appendStringInfo(buf, " %u", xlrec->xids[i]); + for (i = 0; i < xlrec->nmembers; i++) + appendStringInfo(buf, " %u", xlrec->members[i].xid); } else appendStringInfo(buf, "UNKNOWN"); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 85f79b9..facf6f0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7792,7 +7792,10 @@ CreateCheckPoint(int flags) MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, - &checkPoint.nextMultiOffset); + &checkPoint.nextMultiOffset, + &checkPoint.oldestSegTruncateXid, + &checkPoint.oldestSegTruncateXidEpoch, + &checkPoint.oldestMultiXactId); /* * Having constructed the checkpoint record, ensure all shmem disk buffers @@ -7930,6 +7933,15 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestXmin(true, false)); + /* + * Also truncate pg_multixact if possible. We can throw away all data + * before the oldestXid value used by the most recent vacuum. As with + * subtrans, skip doing this during recovery, because StartupMultiXact + * hasn't been called yet. + */ + if (!RecoveryInProgress()) + TruncateMultiXact(checkPoint.oldestXid); + /* All real work is done, but log before releasing lock. 
*/ if (log_checkpoints) LogCheckpointEnd(false); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 99e130c..078073a 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3001,7 +3001,7 @@ reindex_relation(Oid relid, int flags) /* Ensure rd_indexattr is valid; see comments for RelationSetIndexList */ if (is_pg_class) - (void) RelationGetIndexAttrBitmap(rel); + (void) RelationGetIndexAttrBitmap(rel, false); PG_TRY(); { diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 32985a4..82f1aa7 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1150,6 +1150,7 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows, * right. (Note: this works out properly when the row was * both inserted and deleted in our xact.) */ + Assert(!(targtuple.t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(targtuple.t_data))) deadrows += 1; else diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 54660f4..5d0cd9e 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1090,6 +1090,7 @@ read_info(SeqTable elm, Relation rel, Buffer *buf) * bit update, ie, don't bother to WAL-log it, since we can certainly do * this again if the update gets lost. 
*/ + Assert(!(tuple.t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (HeapTupleHeaderGetXmax(tuple.t_data) != InvalidTransactionId) { HeapTupleHeaderSetXmax(tuple.t_data, InvalidTransactionId); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index a6e7268..7c1586f 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2578,7 +2578,7 @@ ltrmark:; test = heap_lock_tuple(relation, &tuple, &buffer, &update_ctid, &update_xmax, estate->es_output_cid, - LockTupleExclusive, false); + LockTupleUpdate, false); switch (test) { case HeapTupleSelfUpdated: diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f42504c..37a1ca8 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -680,7 +680,7 @@ vac_update_datfrozenxid(void) * Initialize the "min" calculation with GetOldestXmin, which is a * reasonable approximation to the minimum relfrozenxid for not-yet- * committed pg_class entries for new tables; see AddNewRelationTuple(). - * Se we cannot produce a wrong minimum by starting with this. + * So we cannot produce a wrong minimum by starting with this. */ newFrozenXid = GetOldestXmin(true, true); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index fd7a9ed..d018a95 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -800,7 +800,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) } /* - * Similarly, we have to lock relations selected FOR UPDATE/FOR SHARE + * Similarly, we have to lock relations selected FOR UPDATE/SHARE/KEY SHARE * before we initialize the plan tree, else we'd be risking lock upgrades. * While we are at it, build the ExecRowMark list. 
*/ @@ -820,6 +820,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) { case ROW_MARK_EXCLUSIVE: case ROW_MARK_SHARE: + case ROW_MARK_KEYSHARE: relid = getrelid(rc->rti, rangeTable); relation = heap_open(relid, RowShareLock); break; @@ -1691,7 +1692,7 @@ EvalPlanQual(EState *estate, EPQState *epqstate, /* * Get and lock the updated version of the row; if fail, return NULL. */ - copyTuple = EvalPlanQualFetch(estate, relation, LockTupleExclusive, + copyTuple = EvalPlanQualFetch(estate, relation, LockTupleUpdate, tid, priorXmax); if (copyTuple == NULL) @@ -1929,7 +1930,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, /* updated, so look at the updated row */ tuple.t_self = tuple.t_data->t_ctid; /* updated row should have xmin matching this xmax */ - priorXmax = HeapTupleHeaderGetXmax(tuple.t_data); + priorXmax = HeapTupleHeaderGetUpdateXid(tuple.t_data); ReleaseBuffer(buffer); /* loop back to fetch next in chain */ } diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index 0c48b6b..892fee5 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -111,10 +111,22 @@ lnext: tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); /* okay, try to lock the tuple */ - if (erm->markType == ROW_MARK_EXCLUSIVE) - lockmode = LockTupleExclusive; - else - lockmode = LockTupleShared; + switch (erm->markType) + { + case ROW_MARK_EXCLUSIVE: + lockmode = LockTupleUpdate; + break; + case ROW_MARK_SHARE: + lockmode = LockTupleShare; + break; + case ROW_MARK_KEYSHARE: + lockmode = LockTupleKeyShare; + break; + default: + elog(ERROR, "unsupported rowmark type"); + lockmode = LockTupleUpdate; /* keep compiler quiet */ + break; + } test = heap_lock_tuple(erm->relation, &tuple, &buffer, &update_ctid, &update_xmax, diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 63958c3..4345e84 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2028,7 
+2028,7 @@ _copyRowMarkClause(RowMarkClause *from) RowMarkClause *newnode = makeNode(RowMarkClause); COPY_SCALAR_FIELD(rti); - COPY_SCALAR_FIELD(forUpdate); + COPY_SCALAR_FIELD(strength); COPY_SCALAR_FIELD(noWait); COPY_SCALAR_FIELD(pushedDown); @@ -2387,7 +2387,7 @@ _copyLockingClause(LockingClause *from) LockingClause *newnode = makeNode(LockingClause); COPY_NODE_FIELD(lockedRels); - COPY_SCALAR_FIELD(forUpdate); + COPY_SCALAR_FIELD(strength); COPY_SCALAR_FIELD(noWait); return newnode; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index f3a34a1..0f3f914 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2300,7 +2300,7 @@ static bool _equalLockingClause(LockingClause *a, LockingClause *b) { COMPARE_NODE_FIELD(lockedRels); - COMPARE_SCALAR_FIELD(forUpdate); + COMPARE_SCALAR_FIELD(strength); COMPARE_SCALAR_FIELD(noWait); return true; @@ -2371,7 +2371,7 @@ static bool _equalRowMarkClause(RowMarkClause *a, RowMarkClause *b) { COMPARE_SCALAR_FIELD(rti); - COMPARE_SCALAR_FIELD(forUpdate); + COMPARE_SCALAR_FIELD(strength); COMPARE_SCALAR_FIELD(noWait); COMPARE_SCALAR_FIELD(pushedDown); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f7d39ed..5340c07 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2077,7 +2077,7 @@ _outLockingClause(StringInfo str, LockingClause *node) WRITE_NODE_TYPE("LOCKINGCLAUSE"); WRITE_NODE_FIELD(lockedRels); - WRITE_BOOL_FIELD(forUpdate); + WRITE_ENUM_FIELD(strength, LockClauseStrength); WRITE_BOOL_FIELD(noWait); } @@ -2255,7 +2255,7 @@ _outRowMarkClause(StringInfo str, RowMarkClause *node) WRITE_NODE_TYPE("ROWMARKCLAUSE"); WRITE_UINT_FIELD(rti); - WRITE_BOOL_FIELD(forUpdate); + WRITE_ENUM_FIELD(strength, LockClauseStrength); WRITE_BOOL_FIELD(noWait); WRITE_BOOL_FIELD(pushedDown); } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 29a0e8f..7c08964 100644 --- a/src/backend/nodes/readfuncs.c 
+++ b/src/backend/nodes/readfuncs.c @@ -301,7 +301,7 @@ _readRowMarkClause(void) READ_LOCALS(RowMarkClause); READ_UINT_FIELD(rti); - READ_BOOL_FIELD(forUpdate); + READ_ENUM_FIELD(strength, LockClauseStrength); READ_BOOL_FIELD(noWait); READ_BOOL_FIELD(pushedDown); diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 5b170b3..81b0be1 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -564,11 +564,11 @@ make_outerjoininfo(PlannerInfo *root, Assert(jointype != JOIN_RIGHT); /* - * Presently the executor cannot support FOR UPDATE/SHARE marking of rels + * Presently the executor cannot support FOR UPDATE/SHARE/KEY SHARE marking of rels * appearing on the nullable side of an outer join. (It's somewhat unclear * what that would mean, anyway: what should we mark when a result row is * generated from no element of the nullable relation?) So, complain if - * any nullable rel is FOR UPDATE/SHARE. + * any nullable rel is FOR UPDATE/SHARE/KEY SHARE. * * You might be wondering why this test isn't made far upstream in the * parser. 
It's because the parser hasn't got enough info --- consider @@ -586,7 +586,7 @@ make_outerjoininfo(PlannerInfo *root, (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels))) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to the nullable side of an outer join"))); + errmsg("SELECT FOR UPDATE/SHARE/KEY SHARE cannot be applied to the nullable side of an outer join"))); } sjinfo->syn_lefthand = left_rels; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 5c18b72..5c83d10 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1927,7 +1927,7 @@ preprocess_rowmarks(PlannerInfo *root) if (parse->rowMarks) { /* - * We've got trouble if FOR UPDATE/SHARE appears inside grouping, + * We've got trouble if FOR UPDATE/SHARE/KEY SHARE appears inside grouping, * since grouping renders a reference to individual tuple CTIDs * invalid. This is also checked at parse time, but that's * insufficient because of rule substitution, query pullup, etc. @@ -1937,7 +1937,7 @@ preprocess_rowmarks(PlannerInfo *root) else { /* - * We only need rowmarks for UPDATE, DELETE, or FOR UPDATE/SHARE. + * We only need rowmarks for UPDATE, DELETE, or FOR UPDATE/SHARE/KEY SHARE. */ if (parse->commandType != CMD_UPDATE && parse->commandType != CMD_DELETE) @@ -1947,7 +1947,7 @@ preprocess_rowmarks(PlannerInfo *root) /* * We need to have rowmarks for all base relations except the target. We * make a bitmapset of all base rels and then remove the items we don't - * need or have FOR UPDATE/SHARE marks for. + * need or have FOR UPDATE/SHARE/KEY SHARE marks for. 
*/ rels = get_base_rel_indexes((Node *) parse->jointree); if (parse->resultRelation) @@ -1984,10 +1984,20 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - if (rc->forUpdate) - newrc->markType = ROW_MARK_EXCLUSIVE; - else - newrc->markType = ROW_MARK_SHARE; + switch (rc->strength) + { + case LCS_FORUPDATE: + newrc->markType = ROW_MARK_EXCLUSIVE; + break; + case LCS_FORSHARE: + newrc->markType = ROW_MARK_SHARE; + break; + case LCS_FORKEYSHARE: + newrc->markType = ROW_MARK_KEYSHARE; + break; + default: + elog(ERROR, "unsupported rowmark type %d", rc->strength); + } newrc->noWait = rc->noWait; newrc->isParent = false; diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index e4a4e3a..e2ff39f 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -2310,7 +2310,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, /* make a clause we can pass down to subqueries to select all rels */ allrels = makeNode(LockingClause); allrels->lockedRels = NIL; /* indicates all rels */ - allrels->forUpdate = lc->forUpdate; + allrels->strength = lc->strength; allrels->noWait = lc->noWait; if (lockedRels == NIL) @@ -2329,12 +2329,12 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, if (rte->relkind == RELKIND_FOREIGN_TABLE) break; applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, pushedDown); + lc->strength, lc->noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; break; case RTE_SUBQUERY: applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, pushedDown); + lc->strength, lc->noWait, pushedDown); /* * FOR UPDATE/SHARE of subquery is propagated to all of @@ -2384,13 +2384,13 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, rte->eref->aliasname), parser_errposition(pstate, thisrel->location))); applyLockingClause(qry, i, - lc->forUpdate, 
lc->noWait, + lc->strength, lc->noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; break; case RTE_SUBQUERY: applyLockingClause(qry, i, - lc->forUpdate, lc->noWait, + lc->strength, lc->noWait, pushedDown); /* see comment above */ transformLockingClause(pstate, rte->subquery, @@ -2443,7 +2443,7 @@ transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, */ void applyLockingClause(Query *qry, Index rtindex, - bool forUpdate, bool noWait, bool pushedDown) + LockClauseStrength strength, bool noWait, bool pushedDown) { RowMarkClause *rc; @@ -2455,10 +2455,10 @@ applyLockingClause(Query *qry, Index rtindex, if ((rc = get_parse_rowmark(qry, rtindex)) != NULL) { /* - * If the same RTE is specified both FOR UPDATE and FOR SHARE, treat - * it as FOR UPDATE. (Reasonable, since you can't take both a shared - * and exclusive lock at the same time; it'll end up being exclusive - * anyway.) + * If the same RTE is specified for more than one locking strength, + * treat it as the strongest. (Reasonable, since you can't take both a + * shared and exclusive lock at the same time; it'll end up being + * exclusive anyway.) * * We also consider that NOWAIT wins if it's specified both ways. This * is a bit more debatable but raising an error doesn't seem helpful. @@ -2467,7 +2467,7 @@ applyLockingClause(Query *qry, Index rtindex, * * And of course pushedDown becomes false if any clause is explicit.
*/ - rc->forUpdate |= forUpdate; + rc->strength = Max(rc->strength, strength); rc->noWait |= noWait; rc->pushedDown &= pushedDown; return; @@ -2476,7 +2476,7 @@ applyLockingClause(Query *qry, Index rtindex, /* Make a new RowMarkClause */ rc = makeNode(RowMarkClause); rc->rti = rtindex; - rc->forUpdate = forUpdate; + rc->strength = strength; rc->noWait = noWait; rc->pushedDown = pushedDown; qry->rowMarks = lappend(qry->rowMarks, rc); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c135465..1eb9962 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -8786,7 +8786,7 @@ for_locking_item: { LockingClause *n = makeNode(LockingClause); n->lockedRels = $3; - n->forUpdate = TRUE; + n->strength = LCS_FORUPDATE; n->noWait = $4; $$ = (Node *) n; } @@ -8794,10 +8794,18 @@ for_locking_item: { LockingClause *n = makeNode(LockingClause); n->lockedRels = $3; - n->forUpdate = FALSE; + n->strength = LCS_FORSHARE; n->noWait = $4; $$ = (Node *) n; } + | FOR KEY SHARE locked_rels_list opt_nowait + { + LockingClause *n = makeNode(LockingClause); + n->lockedRels = $4; + n->strength = LCS_FORKEYSHARE; + n->noWait = $5; + $$ = (Node *) n; + } ; locked_rels_list: diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 3b31108..dc14a0d 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -55,7 +55,7 @@ static void rewriteValuesRTE(RangeTblEntry *rte, Relation target_relation, static void rewriteTargetListUD(Query *parsetree, RangeTblEntry *target_rte, Relation target_relation); static void markQueryForLocking(Query *qry, Node *jtnode, - bool forUpdate, bool noWait, bool pushedDown); + LockClauseStrength strength, bool noWait, bool pushedDown); static List *matchLocks(CmdType event, RuleLock *rulelocks, int varno, Query *parsetree); static Query *fireRIRrules(Query *parsetree, List *activeRIRs, @@ -1401,8 +1401,8 @@ ApplyRetrieveRule(Query *parsetree, 
rte->modifiedCols = NULL; /* - * If FOR UPDATE/SHARE of view, mark all the contained tables as implicit - * FOR UPDATE/SHARE, the same as the parser would have done if the view's + * If FOR UPDATE/SHARE/KEY SHARE of view, mark all the contained tables as implicit + * FOR UPDATE/SHARE/KEY SHARE, the same as the parser would have done if the view's * subquery had been written out explicitly. * * Note: we don't consider forUpdatePushedDown here; such marks will be @@ -1410,13 +1410,13 @@ ApplyRetrieveRule(Query *parsetree, */ if (rc != NULL) markQueryForLocking(rule_action, (Node *) rule_action->jointree, - rc->forUpdate, rc->noWait, true); + rc->strength, rc->noWait, true); return parsetree; } /* - * Recursively mark all relations used by a view as FOR UPDATE/SHARE. + * Recursively mark all relations used by a view as FOR UPDATE/SHARE/KEY SHARE. * * This may generate an invalid query, eg if some sub-query uses an * aggregate. We leave it to the planner to detect that. @@ -1428,7 +1428,7 @@ ApplyRetrieveRule(Query *parsetree, */ static void markQueryForLocking(Query *qry, Node *jtnode, - bool forUpdate, bool noWait, bool pushedDown) + LockClauseStrength strength, bool noWait, bool pushedDown) { if (jtnode == NULL) return; @@ -1442,16 +1442,16 @@ markQueryForLocking(Query *qry, Node *jtnode, /* ignore foreign tables */ if (rte->relkind != RELKIND_FOREIGN_TABLE) { - applyLockingClause(qry, rti, forUpdate, noWait, pushedDown); + applyLockingClause(qry, rti, strength, noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; } } else if (rte->rtekind == RTE_SUBQUERY) { - applyLockingClause(qry, rti, forUpdate, noWait, pushedDown); - /* FOR UPDATE/SHARE of subquery is propagated to subquery's rels */ + applyLockingClause(qry, rti, strength, noWait, pushedDown); + /* FOR UPDATE/SHARE/KEY SHARE of subquery is propagated to subquery's rels */ markQueryForLocking(rte->subquery, (Node *) rte->subquery->jointree, - forUpdate, noWait, true); + strength, noWait, true); } 
/* other RTE types are unaffected by FOR UPDATE */ } @@ -1461,14 +1461,14 @@ markQueryForLocking(Query *qry, Node *jtnode, ListCell *l; foreach(l, f->fromlist) - markQueryForLocking(qry, lfirst(l), forUpdate, noWait, pushedDown); + markQueryForLocking(qry, lfirst(l), strength, noWait, pushedDown); } else if (IsA(jtnode, JoinExpr)) { JoinExpr *j = (JoinExpr *) jtnode; - markQueryForLocking(qry, j->larg, forUpdate, noWait, pushedDown); - markQueryForLocking(qry, j->rarg, forUpdate, noWait, pushedDown); + markQueryForLocking(qry, j->larg, strength, noWait, pushedDown); + markQueryForLocking(qry, j->rarg, strength, noWait, pushedDown); } else elog(ERROR, "unrecognized node type: %d", diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 345f6f5..45b7c7b 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -3869,9 +3869,10 @@ CheckForSerializableConflictOut(bool visible, Relation relation, case HEAPTUPLE_RECENTLY_DEAD: if (!visible) return; - xid = HeapTupleHeaderGetXmax(tuple->t_data); + xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); break; case HEAPTUPLE_DELETE_IN_PROGRESS: + Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); xid = HeapTupleHeaderGetXmax(tuple->t_data); break; case HEAPTUPLE_INSERT_IN_PROGRESS: diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 5b06333..65f629b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -130,7 +130,7 @@ CommandIsReadOnly(Node *parsetree) if (stmt->intoClause != NULL) return false; /* SELECT INTO */ else if (stmt->rowMarks != NIL) - return false; /* SELECT FOR UPDATE/SHARE */ + return false; /* SELECT FOR UPDATE/SHARE/KEY SHARE */ else if (stmt->hasModifyingCTE) return false; /* data-modifying CTE */ else @@ -2147,10 +2147,21 @@ CreateCommandTag(Node *parsetree) else if (stmt->rowMarks != NIL) { /* not 100% but probably close enough */ - if (((PlanRowMark *) 
linitial(stmt->rowMarks))->markType == ROW_MARK_EXCLUSIVE) - tag = "SELECT FOR UPDATE"; - else - tag = "SELECT FOR SHARE"; + switch (((PlanRowMark *) linitial(stmt->rowMarks))->markType) + { + case ROW_MARK_EXCLUSIVE: + tag = "SELECT FOR UPDATE"; + break; + case ROW_MARK_SHARE: + tag = "SELECT FOR SHARE"; + break; + case ROW_MARK_KEYSHARE: + tag = "SELECT FOR KEY SHARE"; + break; + default: + tag = "???"; + break; + } } else tag = "SELECT"; @@ -2197,10 +2208,21 @@ CreateCommandTag(Node *parsetree) else if (stmt->rowMarks != NIL) { /* not 100% but probably close enough */ - if (((RowMarkClause *) linitial(stmt->rowMarks))->forUpdate) - tag = "SELECT FOR UPDATE"; - else - tag = "SELECT FOR SHARE"; + switch (((RowMarkClause *) linitial(stmt->rowMarks))->strength) + { + case LCS_FORUPDATE: + tag = "SELECT FOR UPDATE"; + break; + case LCS_FORSHARE: + tag = "SELECT FOR SHARE"; + break; + case LCS_FORKEYSHARE: + tag = "SELECT FOR KEY SHARE"; + break; + default: + tag = "???"; + break; + } } else tag = "SELECT"; diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 522a540..f4a4456 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -308,7 +308,7 @@ RI_FKey_check(PG_FUNCTION_ARGS) * Get the relation descriptors of the FK and PK tables. * * pk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it.
*/ fk_rel = trigdata->tg_relation; pk_rel = heap_open(riinfo.pk_relid, RowShareLock); @@ -338,12 +338,12 @@ RI_FKey_check(PG_FUNCTION_ARGS) /* --------- * The query string built is - * SELECT 1 FROM ONLY + * SELECT 1 FROM ONLY x FOR KEY SHARE OF x * ---------- */ quoteRelationName(pkrelname, pk_rel); snprintf(querystr, sizeof(querystr), - "SELECT 1 FROM ONLY %s x FOR SHARE OF x", + "SELECT 1 FROM ONLY %s x FOR KEY SHARE OF x", pkrelname); /* Prepare and save the plan */ @@ -463,7 +463,8 @@ RI_FKey_check(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE pkatt1 = $1 [AND ...] FOR SHARE + * SELECT 1 FROM ONLY x WHERE pkatt1 = $1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding FK attributes. * ---------- @@ -487,7 +488,7 @@ RI_FKey_check(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = fk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, @@ -625,7 +626,8 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel, /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE pkatt1 = $1 [AND ...] FOR SHARE + * SELECT 1 FROM ONLY x WHERE pkatt1 = $1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * PK attributes themselves. * ---------- @@ -648,7 +650,7 @@ ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel, querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids, @@ -712,7 +714,7 @@ RI_FKey_noaction_del(PG_FUNCTION_ARGS) * Get the relation descriptors of the FK and PK tables and the old tuple. * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. 
+ * SELECT FOR KEY SHARE will get on it. */ fk_rel = heap_open(riinfo.fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -780,7 +782,8 @@ RI_FKey_noaction_del(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE $1 = fkatt1 [AND ...] + * SELECT 1 FROM ONLY x WHERE $1 = fkatt1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding PK attributes. * ---------- @@ -805,7 +808,7 @@ RI_FKey_noaction_del(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, @@ -890,7 +893,7 @@ RI_FKey_noaction_upd(PG_FUNCTION_ARGS) * old tuple. * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = heap_open(riinfo.fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -993,7 +996,7 @@ RI_FKey_noaction_upd(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, @@ -1431,7 +1434,7 @@ RI_FKey_restrict_del(PG_FUNCTION_ARGS) * Get the relation descriptors of the FK and PK tables and the old tuple. * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = heap_open(riinfo.fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -1489,7 +1492,8 @@ RI_FKey_restrict_del(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE $1 = fkatt1 [AND ...] + * SELECT 1 FROM ONLY x WHERE $1 = fkatt1 [AND ...] 
+ * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding PK attributes. * ---------- @@ -1514,7 +1518,7 @@ RI_FKey_restrict_del(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, @@ -1604,7 +1608,7 @@ RI_FKey_restrict_upd(PG_FUNCTION_ARGS) * old tuple. * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = heap_open(riinfo.fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -1672,7 +1676,8 @@ RI_FKey_restrict_upd(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY WHERE $1 = fkatt1 [AND ...] + * SELECT 1 FROM ONLY x WHERE $1 = fkatt1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding PK attributes. 
* ---------- @@ -1697,7 +1702,7 @@ RI_FKey_restrict_upd(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = pk_type; } - appendStringInfo(&querybuf, " FOR SHARE OF x"); + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 75923a6..fa1e863 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -2863,7 +2863,7 @@ get_select_query_def(Query *query, deparse_context *context, get_rule_expr(query->limitCount, context, false); } - /* Add FOR UPDATE/SHARE clauses if present */ + /* Add FOR UPDATE/SHARE/KEY SHARE clauses if present */ if (query->hasForUpdate) { foreach(l, query->rowMarks) @@ -2875,12 +2875,24 @@ get_select_query_def(Query *query, deparse_context *context, if (rc->pushedDown) continue; - if (rc->forUpdate) - appendContextKeyword(context, " FOR UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - else - appendContextKeyword(context, " FOR SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + switch (rc->strength) + { + case LCS_FORKEYSHARE: + appendContextKeyword(context, " FOR KEY SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORSHARE: + appendContextKeyword(context, " FOR SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORUPDATE: + appendContextKeyword(context, " FOR UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + default: + elog(ERROR, "unrecognized row locking clause %d", rc->strength); + } + appendStringInfo(buf, " OF %s", quote_identifier(rte->eref->aliasname)); if (rc->noWait) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 603e4c1..0e8ef6f 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3624,6 +3624,9 @@ RelationGetIndexPredicate(Relation relation) * simple index keys, but attributes used 
in expressions and partial-index * predicates.) * + * If "keyAttrs" is true, only attributes that can be referenced by foreign + * keys are considered. + * * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that * we can include system attributes (e.g., OID) in the bitmap representation. * @@ -3635,16 +3638,17 @@ RelationGetIndexPredicate(Relation relation) * be bms_free'd when not needed anymore. */ Bitmapset * -RelationGetIndexAttrBitmap(Relation relation) +RelationGetIndexAttrBitmap(Relation relation, bool keyAttrs) { Bitmapset *indexattrs; + Bitmapset *uindexattrs; List *indexoidlist; ListCell *l; MemoryContext oldcxt; /* Quick exit if we already computed the result. */ if (relation->rd_indexattr != NULL) - return bms_copy(relation->rd_indexattr); + return bms_copy(keyAttrs ? relation->rd_keyattr : relation->rd_indexattr); /* Fast path if definitely no indexes */ if (!RelationGetForm(relation)->relhasindex) @@ -3663,26 +3667,38 @@ RelationGetIndexAttrBitmap(Relation relation) * For each index, add referenced attributes to indexattrs. */ indexattrs = NULL; + uindexattrs = NULL; foreach(l, indexoidlist) { Oid indexOid = lfirst_oid(l); Relation indexDesc; IndexInfo *indexInfo; int i; + bool isKey; indexDesc = index_open(indexOid, AccessShareLock); /* Extract index key information from the index's pg_index row */ indexInfo = BuildIndexInfo(indexDesc); + /* Can this index be referenced by a foreign key? 
*/ + isKey = indexInfo->ii_Unique && + indexInfo->ii_Expressions == NIL && + indexInfo->ii_Predicate == NIL; + /* Collect simple attribute references */ for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) { int attrnum = indexInfo->ii_KeyAttrNumbers[i]; if (attrnum != 0) + { indexattrs = bms_add_member(indexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); + if (isKey) + uindexattrs = bms_add_member(uindexattrs, + attrnum - FirstLowInvalidHeapAttributeNumber); + } } /* Collect all attributes used in expressions, too */ @@ -3699,10 +3715,11 @@ RelationGetIndexAttrBitmap(Relation relation) /* Now save a copy of the bitmap in the relcache entry. */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); relation->rd_indexattr = bms_copy(indexattrs); + relation->rd_keyattr = bms_copy(uindexattrs); MemoryContextSwitchTo(oldcxt); /* We return our original working copy for caller to play with */ - return indexattrs; + return keyAttrs ? uindexattrs : indexattrs; } /* diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index d9b37b2..560a53d 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -118,9 +118,11 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); + Assert(!(tup->t_infomask & HEAP_MOVED)); /* We do not store cmax when locking a tuple */ - Assert(!(tup->t_infomask & (HEAP_MOVED | HEAP_IS_LOCKED))); - Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tup))); + Assert(!HeapTupleHeaderIsLocked(tup)); + Assert((tup->t_infomask & HEAP_XMAX_IS_MULTI) || + TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tup))); if (tup->t_infomask & HEAP_COMBOCID) return GetRealCmax(cid); diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 1c4b74d..1e2d3fa 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -213,10 +213,23 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot 
snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HeapTupleHeaderIsLocked(tuple)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { @@ -249,21 +262,34 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; return false; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HeapTupleHeaderIsLocked(tuple)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax); + return false; + } return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; return false; } @@ -281,7 +307,7 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -389,10 +415,23 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, 
Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HeapTupleHeaderIsLocked(tuple)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { @@ -428,21 +467,39 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; return false; } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HeapTupleHeaderIsLocked(tuple)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false)) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax); + return false; + } return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; if (HeapTupleHeaderGetCmax(tuple) >= GetCurrentCommandId(false)) return true; /* deleted after scan started */ @@ -463,7 +520,7 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer) /* xmax 
transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -636,10 +693,24 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HeapTupleMayBeUpdated; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HeapTupleHeaderIsLocked(tuple)) /* not deleter */ return HeapTupleMayBeUpdated; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return HeapTupleMayBeUpdated; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return HeapTupleMayBeUpdated; + else + return HeapTupleSelfUpdated; + /* FIXME -- what do we need to do with the Cmax here? */ + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { @@ -675,27 +746,49 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return HeapTupleMayBeUpdated; + /* XXX might have XMAX_IS_MULTI ... 
*/ return HeapTupleUpdated; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HeapTupleHeaderIsLocked(tuple)) + { + if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple))) + return HeapTupleBeingUpdated; + return HeapTupleMayBeUpdated; + } + + xmax = HeapTupleGetUpdateXid(tuple); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= curcid) + return HeapTupleSelfUpdated; /* updated after scan started */ + else + return HeapTupleInvisible; /* updated before scan started */ + } if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple))) return HeapTupleBeingUpdated; - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); + + if (TransactionIdDidCommit(xmax)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax); + return HeapTupleUpdated; + } + /* it must have aborted or crashed */ return HeapTupleMayBeUpdated; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) - return HeapTupleMayBeUpdated; + if (HeapTupleHeaderIsLocked(tuple)) + return HeapTupleMayBeUpdated; /* FIXME might need rethinking */ if (HeapTupleHeaderGetCmax(tuple) >= curcid) return HeapTupleSelfUpdated; /* updated after scan started */ else @@ -715,7 +808,7 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -802,10 +895,23 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HeapTupleHeaderIsLocked(tuple)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & 
HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else + return false; + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { @@ -842,21 +948,37 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; return false; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HeapTupleHeaderIsLocked(tuple)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) + { + snapshot->xmax = xmax; + return true; + } + if (TransactionIdDidCommit(xmax)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax); + return false; + } return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; return false; } @@ -877,7 +999,7 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot, /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -966,10 +1088,25 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HeapTupleHeaderIsLocked(tuple)) /* not deleter */ return true; - Assert(!(tuple->t_infomask 
& HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!TransactionIdIsValid(xmax)) + return true; + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* updated after scan started */ + else + return false; /* updated before scan started */ + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple))) { @@ -1008,13 +1145,34 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot, if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return true; if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax; + + if (HeapTupleHeaderIsLocked(tuple)) + return true; + + xmax = HeapTupleGetUpdateXid(tuple); + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) + { + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax); + /* updating transaction committed, but when? 
*/ + if (XidInMVCCSnapshot(xmax, snapshot)) + return true; /* treat as still in progress */ + return false; + } return true; } @@ -1121,8 +1279,9 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HEAPTUPLE_INSERT_IN_PROGRESS; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) return HEAPTUPLE_INSERT_IN_PROGRESS; + /* FIXME -- probably need something here */ /* inserted and then deleted by same xact */ return HEAPTUPLE_DELETE_IN_PROGRESS; } @@ -1153,7 +1312,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, if (tuple->t_infomask & HEAP_XMAX_INVALID) return HEAPTUPLE_LIVE; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HeapTupleHeaderIsLocked(tuple)) { /* * "Deleting" xact really only locked it, so the tuple is live in any @@ -1177,6 +1336,10 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, } /* + * FIXME -- if the multixact is gone, we should replace it with + * the plain updating Xid and remove the IS_MULTI bit. + */ + /* * We don't really care whether xmax did commit, abort or crash. * We know that xmax did lock the tuple, but it did not and will * never actually update it. 
@@ -1184,14 +1347,44 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); } + return HEAPTUPLE_LIVE; } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); - return HEAPTUPLE_LIVE; + TransactionId xmax; + + if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple))) + return HEAPTUPLE_LIVE; + + xmax = HeapTupleGetUpdateXid(tuple); + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + Assert(!TransactionIdIsInProgress(xmax)); + Assert(!TransactionIdIsCurrentTransactionId(xmax)); + if (TransactionIdDidCommit(xmax)) + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xmax); + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + } + + /* + * Deleter committed, but perhaps it was recent enough that some open + * transactions could still see the tuple. 
+ */ + if (!TransactionIdPrecedes(xmax, OldestXmin)) + return HEAPTUPLE_RECENTLY_DEAD; + + /* Otherwise, it's dead and removable */ + return HEAPTUPLE_DEAD; } if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index 5b8ae88..4fdd6de 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -87,6 +87,7 @@ main(int argc, char *argv[]) Oid set_oid = 0; MultiXactId set_mxid = 0; MultiXactOffset set_mxoff = (MultiXactOffset) -1; + TransactionId set_mxfreeze = FrozenTransactionId; uint32 minXlogTli = 0, minXlogId = 0, minXlogSeg = 0; @@ -116,7 +117,7 @@ main(int argc, char *argv[]) } - while ((c = getopt(argc, argv, "fl:m:no:O:x:e:")) != -1) + while ((c = getopt(argc, argv, "fl:m:no:O:x:e:z:")) != -1) { switch (c) { @@ -203,6 +204,23 @@ main(int argc, char *argv[]) } break; + case 'z': + set_mxfreeze = strtoul(optarg, &endptr, 0); + if (endptr == optarg || *endptr != '\0') + { + fprintf(stderr, _("%s: invalid argument for option -z\n"), progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + /* InvalidTransactionId is allowed here */ + if (set_mxfreeze == FrozenTransactionId || + set_mxfreeze == BootstrapTransactionId) + { + fprintf(stderr, _("%s: multitransaction freezeXid (-z) must not be 1 or 2\n"), progname); + exit(1); + } + break; + case 'l': minXlogTli = strtoul(optarg, &endptr, 0); if (endptr == optarg || *endptr != ',') @@ -332,6 +350,11 @@ main(int argc, char *argv[]) if (set_mxoff != -1) ControlFile.checkPointCopy.nextMultiOffset = set_mxoff; + /* + if (set_mxfreeze != -1) + ControlFile.checkPointCopy.mxactFreezeXid = set_mxfreeze; + */ + if (minXlogTli > ControlFile.checkPointCopy.ThisTimeLineID) ControlFile.checkPointCopy.ThisTimeLineID = minXlogTli; @@ -578,6 +601,10 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.nextMulti); printf(_("Latest checkpoint's NextMultiOffset: 
%u\n"), ControlFile.checkPointCopy.nextMultiOffset); + /* + printf(_("Latest checkpoint's MultiXact freezeXid: %u\n"), + ControlFile.checkPointCopy.mxactFreezeXid); + */ printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); printf(_("Latest checkpoint's oldestXID's DB: %u\n"), diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 776ea5c..363c3bd 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -29,10 +29,22 @@ typedef struct BulkInsertStateData *BulkInsertState; +/* + * Possible lock modes for a tuple. + */ typedef enum { - LockTupleShared, - LockTupleExclusive + /* SELECT FOR KEY SHARE */ + LockTupleKeyShare, + /* SELECT FOR SHARE */ + LockTupleShare, + /* + * SELECT FOR UPDATE, and also plain UPDATE when the "key" columns are + * not modified + */ + LockTupleUpdate, + /* other UPDATEs, and DELETE */ + LockTupleKeyUpdate } LockTupleMode; diff --git a/src/include/access/htup.h b/src/include/access/htup.h index 966e2d0..c84cd7f 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -164,12 +164,15 @@ typedef HeapTupleHeaderData *HeapTupleHeader; #define HEAP_HASVARWIDTH 0x0002 /* has variable-width attribute(s) */ #define HEAP_HASEXTERNAL 0x0004 /* has external stored attribute(s) */ #define HEAP_HASOID 0x0008 /* has an object-id field */ -/* bit 0x0010 is available */ +#define HEAP_XMAX_KEYSHR_LOCK 0x0010 /* xmax is a key-shared locker */ #define HEAP_COMBOCID 0x0020 /* t_cid is a combo cid */ #define HEAP_XMAX_EXCL_LOCK 0x0040 /* xmax is exclusive locker */ -#define HEAP_XMAX_SHARED_LOCK 0x0080 /* xmax is shared locker */ -/* if either LOCK bit is set, xmax hasn't deleted the tuple, only locked it */ -#define HEAP_IS_LOCKED (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK) +#define HEAP_XMAX_IS_NOT_UPDATE 0x0080 /* xmax, if valid, is only a locker. + * Note this is not set unless + * XMAX_IS_MULTI is also set. 
*/ + +#define HEAP_LOCK_BITS (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_IS_NOT_UPDATE | \ + HEAP_XMAX_KEYSHR_LOCK) #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ #define HEAP_XMIN_INVALID 0x0200 /* t_xmin invalid/aborted */ #define HEAP_XMAX_COMMITTED 0x0400 /* t_xmax committed */ @@ -187,14 +190,30 @@ typedef HeapTupleHeaderData *HeapTupleHeader; #define HEAP_XACT_MASK 0xFFE0 /* visibility-related bits */ /* + * A tuple is only locked (i.e. not updated by its Xmax) if the Xmax is not + * a multixact and it has either the EXCL_LOCK or KEYSHR_LOCK bits set, or if + * the xmax is a multi that doesn't contain an update. + * + * Beware of multiple evaluation of arguments. + */ +#define HeapTupleHeaderInfomaskIsLocked(infomask) \ + ((!((infomask) & HEAP_XMAX_IS_MULTI) && \ + (infomask) & (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)) || \ + (((infomask) & HEAP_XMAX_IS_MULTI) && ((infomask) & HEAP_XMAX_IS_NOT_UPDATE))) + +#define HeapTupleHeaderIsLocked(tup) \ + HeapTupleHeaderInfomaskIsLocked((tup)->t_infomask) + +/* * information stored in t_infomask2: */ #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ -/* bits 0x3800 are available */ +/* bits 0x1800 are available */ +#define HEAP_UPDATE_KEY_INTACT 0x2000 /* tuple updated, key cols untouched */ #define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ #define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ -#define HEAP2_XACT_MASK 0xC000 /* visibility-related bits */ +#define HEAP2_XACT_MASK 0xE000 /* visibility-related bits */ /* * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins. It is @@ -221,6 +240,23 @@ typedef HeapTupleHeaderData *HeapTupleHeader; (tup)->t_choice.t_heap.t_xmin = (xid) \ ) +/* + * HeapTupleHeaderGetXmax gets you the raw Xmax field. To find out the Xid + * that updated a tuple, you might need to resolve the MultiXactId if certain + * bits are set. HeapTupleHeaderGetUpdateXid checks those bits and takes care + * to resolve the MultiXactId if necessary.
This might involve multixact I/O, + * so it should only be used if absolutely necessary. + */ +#define HeapTupleHeaderGetUpdateXid(tup) \ +( \ + (!((tup)->t_infomask & HEAP_XMAX_INVALID) && \ + ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \ + !((tup)->t_infomask & HEAP_XMAX_IS_NOT_UPDATE)) ? \ + HeapTupleGetUpdateXid(tup) \ + : \ + HeapTupleHeaderGetXmax(tup) \ +) + #define HeapTupleHeaderGetXmax(tup) \ ( \ (tup)->t_choice.t_heap.t_xmax \ @@ -721,16 +757,22 @@ typedef struct xl_heap_newpage #define SizeOfHeapNewpage (offsetof(xl_heap_newpage, blkno) + sizeof(BlockNumber)) +/* flags for xl_heap_lock.infobits_set */ +#define XLHL_XMAX_IS_MULTI 0x01 +#define XLHL_XMAX_IS_NOT_UPDATE 0x02 +#define XLHL_XMAX_EXCL_LOCK 0x04 +#define XLHL_XMAX_KEYSHR_LOCK 0x08 +#define XLHL_UPDATE_KEY_INTACT 0x10 + /* This is what we need to know about lock */ typedef struct xl_heap_lock { xl_heaptid target; /* locked tuple id */ TransactionId locking_xid; /* might be a MultiXactId not xid */ - bool xid_is_mxact; /* is it? */ - bool shared_lock; /* shared or exclusive row lock? 
*/ + int8 infobits_set; /* infomask and infomask2 bits to set */ } xl_heap_lock; -#define SizeOfHeapLock (offsetof(xl_heap_lock, shared_lock) + sizeof(bool)) +#define SizeOfHeapLock (offsetof(xl_heap_lock, infobits_set) + sizeof(int8)) /* This is what we need to know about in-place update */ typedef struct xl_heap_inplace @@ -768,8 +810,7 @@ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup); extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, - CommandId *cmax, - bool *iscombo); + CommandId *cmax, bool *iscombo); /* ---------------- * fastgetattr @@ -854,6 +895,9 @@ extern Datum fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, heap_getsysattr((tup), (attnum), (tupleDesc), (isnull)) \ ) +/* Prototype for HeapTupleHeader accessor in heapam.c */ +extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple); + /* prototypes for functions in common/heaptuple.c */ extern Size heap_compute_data_size(TupleDesc tupleDesc, Datum *values, bool *isnull); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index c3ec763..ff255d7 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -13,8 +13,14 @@ #include "access/xlog.h" + +/* + * The first two MultiXactId values are reserved to store the truncation Xid + * and epoch of the first segment, so we start assigning multixact values from + * 2. + */ #define InvalidMultiXactId ((MultiXactId) 0) -#define FirstMultiXactId ((MultiXactId) 1) +#define FirstMultiXactId ((MultiXactId) 2) #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) @@ -22,6 +28,31 @@ #define NUM_MXACTOFFSET_BUFFERS 8 #define NUM_MXACTMEMBER_BUFFERS 16 +/* + * Possible multixact lock modes ("status"). 
The first three modes are for + * tuple locks (FOR KEY SHARE, FOR SHARE and FOR UPDATE, respectively); the + * fourth is used for an update that doesn't modify key columns. The fifth one + * is used for other updates and deletes. Note that we only use two bits to + * represent them on disk, which means we don't have space to represent the + * last one. This is okay, because a multixact can never contain such an + * operation; this mode is only used to wait for other modes. + */ +typedef enum +{ + MultiXactStatusForKeyShare = 0x00, + MultiXactStatusForShare = 0x01, + MultiXactStatusForUpdate = 0x02, + MultiXactStatusUpdate = 0x03, + MultiXactStatusKeyUpdate = 0x04, +} MultiXactStatus; + +typedef struct MultiXactMember +{ + TransactionId xid; + MultiXactStatus status; +} MultiXactMember; + + /* ---------------- * multixact-related XLOG entries * ---------------- @@ -35,21 +66,27 @@ typedef struct xl_multixact_create { MultiXactId mid; /* new MultiXact's ID */ MultiXactOffset moff; /* its starting offset in members file */ - int32 nxids; /* number of member XIDs */ - TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */ + int32 nmembers; /* number of member XIDs */ + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; } xl_multixact_create; -#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, xids) +#define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, members) -extern MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2); -extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid); +extern MultiXactId MultiXactIdCreateSingleton(TransactionId xid, + MultiXactStatus status); +extern MultiXactId MultiXactIdCreate(TransactionId xid1, + MultiXactStatus status1, TransactionId xid2, + MultiXactStatus status2); +extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, + MultiXactStatus status); extern bool MultiXactIdIsRunning(MultiXactId multi); -extern bool MultiXactIdIsCurrent(MultiXactId multi); -extern void 
MultiXactIdWait(MultiXactId multi); -extern bool ConditionalMultiXactIdWait(MultiXactId multi); +extern void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, + int *remaining); +extern bool ConditionalMultiXactIdWait(MultiXactId multi, + MultiXactStatus status, int *remaining); extern void MultiXactIdSetOldestMember(void); -extern int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids); +extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **xids); extern void AtEOXact_MultiXact(void); extern void AtPrepare_MultiXact(void); @@ -62,8 +99,12 @@ extern void StartupMultiXact(void); extern void ShutdownMultiXact(void); extern void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId *nextMulti, - MultiXactOffset *nextMultiOffset); + MultiXactOffset *nextMultiOffset, + TransactionId *oldestTruncateXid, + uint32 *oldestTruncateXidEpoch, + MultiXactId *oldestMulti); extern void CheckPointMultiXact(void); +extern void TruncateMultiXact(TransactionId oldestXid); extern void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset); extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index cb43879..2e73233 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -71,7 +71,7 @@ typedef struct XLogContRecord /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD068 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD069 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 6688c19..f82295a 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -38,6 +38,10 @@ typedef struct CheckPoint Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ MultiXactOffset nextMultiOffset; /* 
next free MultiXact offset */ + TransactionId oldestSegTruncateXid; /* truncate xid of oldest multixact + * offset segment */ + uint32 oldestSegTruncateXidEpoch; /* epoch of above xid */ + MultiXactId oldestMultiXactId; /* oldest MultiXactId still on disk */ TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ Oid oldestXidDB; /* database with minimum datfrozenxid */ pg_time_t time; /* time stamp of checkpoint */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0a89f18..5167f09 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -404,9 +404,9 @@ typedef struct EState /* * ExecRowMark - - * runtime representation of FOR UPDATE/SHARE clauses + * runtime representation of FOR UPDATE/SHARE/KEY SHARE clauses * - * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we should have an + * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE/KEY SHARE, we should have an * ExecRowMark for each non-target relation in the query (except inheritance * parent RTEs, which can be ignored at runtime). See PlanRowMark for details * about most of the fields. In addition to fields directly derived from @@ -427,7 +427,7 @@ typedef struct ExecRowMark /* * ExecAuxRowMark - - * additional runtime representation of FOR UPDATE/SHARE clauses + * additional runtime representation of FOR UPDATE/SHARE/KEY SHARE clauses * * Each LockRows and ModifyTable node keeps a list of the rowmarks it needs to * deal with. In addition to a pointer to the related entry in es_rowMarks, @@ -1815,7 +1815,7 @@ typedef struct SetOpState /* ---------------- * LockRowsState information * - * LockRows nodes are used to enforce FOR UPDATE/FOR SHARE locking. + * LockRows nodes are used to enforce FOR UPDATE/SHARE/KEY SHARE locking. 
* ---------------- */ typedef struct LockRowsState diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index af6565e..1dc6202 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -74,7 +74,7 @@ typedef uint32 AclMode; /* a bitmask of privilege bits */ #define ACL_CONNECT (1<<11) /* for databases */ #define N_ACL_RIGHTS 12 /* 1 plus the last 1< +step upd: UPDATE foo SET b = 'Hello World'; step com: COMMIT; -step upd: <... completed> starting permutation: upd ins com step upd: UPDATE foo SET b = 'Hello World'; diff --git a/src/test/isolation/expected/fk-deadlock.out b/src/test/isolation/expected/fk-deadlock.out index 36813f1..69a294a 100644 --- a/src/test/isolation/expected/fk-deadlock.out +++ b/src/test/isolation/expected/fk-deadlock.out @@ -11,57 +11,51 @@ step s2c: COMMIT; starting permutation: s1i s1u s2i s1c s2u s2c step s1i: INSERT INTO child VALUES (1, 1); step s1u: UPDATE parent SET aux = 'bar'; -step s2i: INSERT INTO child VALUES (2, 1); +step s2i: INSERT INTO child VALUES (2, 1); step s1c: COMMIT; -step s2i: <... completed> step s2u: UPDATE parent SET aux = 'baz'; step s2c: COMMIT; starting permutation: s1i s2i s1u s2u s1c s2c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; step s1c: COMMIT; +step s2u: <... completed> step s2c: COMMIT; starting permutation: s1i s2i s2u s1u s2c s1c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... 
completed> -error in steps s1u s2u: ERROR: deadlock detected +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... completed> step s1c: COMMIT; starting permutation: s2i s1i s1u s2u s1c s2c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; step s1c: COMMIT; +step s2u: <... completed> step s2c: COMMIT; starting permutation: s2i s1i s2u s1u s2c s1c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... completed> -error in steps s1u s2u: ERROR: deadlock detected +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... completed> step s1c: COMMIT; starting permutation: s2i s2u s1i s2c s1u s1c step s2i: INSERT INTO child VALUES (2, 1); step s2u: UPDATE parent SET aux = 'baz'; -step s1i: INSERT INTO child VALUES (1, 1); +step s1i: INSERT INTO child VALUES (1, 1); step s2c: COMMIT; -step s1i: <... completed> step s1u: UPDATE parent SET aux = 'bar'; step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock2.out b/src/test/isolation/expected/fk-deadlock2.out index 2d8e5e5..41a818d 100644 --- a/src/test/isolation/expected/fk-deadlock2.out +++ b/src/test/isolation/expected/fk-deadlock2.out @@ -17,91 +17,79 @@ step s2u1: <... 
completed> step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; -starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c +starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c +starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; -starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c +starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s1c: COMMIT; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; - -starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... 
completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s2c: COMMIT; step s1c: COMMIT; -starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c +starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c +starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; -starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c +starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c +starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... 
completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> step s1c: COMMIT; starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; -step s1u1: <... completed> +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock2_1.out b/src/test/isolation/expected/fk-deadlock2_1.out index 30c4c99..3827348 100644 --- a/src/test/isolation/expected/fk-deadlock2_1.out +++ b/src/test/isolation/expected/fk-deadlock2_1.out @@ -19,92 +19,87 @@ step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; ERROR: current transaction is aborted, commands ignored until end of transaction block step s2c: COMMIT; -starting permutation: s1u1 s2u1 s1u2 s2u2 s1c s2c +starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: <... 
completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c +starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -starting permutation: s1u1 s2u1 s2u2 s1u2 s1c s2c +starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s1c: COMMIT; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; - -starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected -step s2c: COMMIT; +ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; -starting permutation: s2u1 s1u1 s1u2 s2u2 s1c s2c +starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: <... 
completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c +starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: <... completed> -error in steps s2u2 s1u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; -starting permutation: s2u1 s1u1 s2u2 s1u2 s1c s2c +starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; -step s2c: COMMIT; -starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c +starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; -step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s2u2: <... completed> -error in steps s1u2 s2u2: ERROR: deadlock detected +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2c: COMMIT; +step s1u2: <... 
completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; -step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; step s2c: COMMIT; -step s1u1: <... completed> step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock2_2.out b/src/test/isolation/expected/fk-deadlock2_2.out new file mode 100644 index 0000000..b6be4b9 --- /dev/null +++ b/src/test/isolation/expected/fk-deadlock2_2.out @@ -0,0 +1,105 @@ +Parsed test spec with 2 sessions + +starting permutation: s1u1 s1u2 s1c s2u1 s2u2 s2c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; + +starting permutation: s1u1 s1u2 s2u1 s1c s2u2 s2c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; +step s2u1: <... 
completed> +error in steps s1c s2u1: ERROR: could not serialize access due to concurrent update +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: current transaction is aborted, commands ignored until end of transaction block +step s2c: COMMIT; + +starting permutation: s1u1 s2u1 s1u2 s2u2 s2c s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s1u1 s2u1 s2u2 s1u2 s2c s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s1u1 s2u1 s2u2 s2c s1u2 s1c +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s1u2 s2u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... 
completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s2u2 s1u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s1u1 s2u2 s2c s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s1u1 s1u2 s2c s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u2: <... 
completed> +error in steps s2c s1u2: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s1u1 s2c s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s2c: COMMIT; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2u1 s2u2 s2c s1u1 s1u2 s1c +step s2u1: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s2c: COMMIT; +step s1u1: UPDATE A SET Col1 = 1 WHERE AID = 1; +step s1u2: UPDATE B SET Col2 = 1 WHERE BID = 2; +step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock3.out b/src/test/isolation/expected/fk-deadlock3.out new file mode 100644 index 0000000..e69de29 diff --git a/src/test/isolation/expected/fk-deadlock_1.out b/src/test/isolation/expected/fk-deadlock_1.out index ca75322..d648e48 100644 --- a/src/test/isolation/expected/fk-deadlock_1.out +++ b/src/test/isolation/expected/fk-deadlock_1.out @@ -11,61 +11,57 @@ step s2c: COMMIT; starting permutation: s1i s1u s2i s1c s2u s2c step s1i: INSERT INTO child VALUES (1, 1); step s1u: UPDATE parent SET aux = 'bar'; -step s2i: INSERT INTO child VALUES (2, 1); +step s2i: INSERT INTO child VALUES (2, 1); step s1c: COMMIT; -step s2i: <... completed> -error in steps s1c s2i: ERROR: could not serialize access due to concurrent update step s2u: UPDATE parent SET aux = 'baz'; -ERROR: current transaction is aborted, commands ignored until end of transaction block +ERROR: could not serialize access due to read/write dependencies among transactions step s2c: COMMIT; starting permutation: s1i s2i s1u s2u s1c s2c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: <... 
completed> -error in steps s2u s1u: ERROR: deadlock detected +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update step s2c: COMMIT; starting permutation: s1i s2i s2u s1u s2c s1c step s1i: INSERT INTO child VALUES (1, 1); step s2i: INSERT INTO child VALUES (2, 1); -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... completed> -error in steps s1u s2u: ERROR: deadlock detected +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; starting permutation: s2i s1i s1u s2u s1c s2c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: <... completed> -error in steps s2u s1u: ERROR: deadlock detected +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update step s2c: COMMIT; starting permutation: s2i s1i s2u s1u s2c s1c step s2i: INSERT INTO child VALUES (2, 1); step s1i: INSERT INTO child VALUES (1, 1); -step s2u: UPDATE parent SET aux = 'baz'; -step s1u: UPDATE parent SET aux = 'bar'; -step s2u: <... completed> -error in steps s1u s2u: ERROR: deadlock detected +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; step s2c: COMMIT; +step s1u: <... 
completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update step s1c: COMMIT; starting permutation: s2i s2u s1i s2c s1u s1c step s2i: INSERT INTO child VALUES (2, 1); step s2u: UPDATE parent SET aux = 'baz'; -step s1i: INSERT INTO child VALUES (1, 1); +step s1i: INSERT INTO child VALUES (1, 1); step s2c: COMMIT; -step s1i: <... completed> -error in steps s2c s1i: ERROR: could not serialize access due to concurrent update step s1u: UPDATE parent SET aux = 'bar'; -ERROR: current transaction is aborted, commands ignored until end of transaction block +ERROR: could not serialize access due to read/write dependencies among transactions step s1c: COMMIT; diff --git a/src/test/isolation/expected/fk-deadlock_2.out b/src/test/isolation/expected/fk-deadlock_2.out new file mode 100644 index 0000000..2d3294e --- /dev/null +++ b/src/test/isolation/expected/fk-deadlock_2.out @@ -0,0 +1,65 @@ +Parsed test spec with 2 sessions + +starting permutation: s1i s1u s1c s2i s2u s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s1c: COMMIT; +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s2c: COMMIT; + +starting permutation: s1i s1u s2i s1c s2u s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2i: INSERT INTO child VALUES (2, 1); +step s1c: COMMIT; +step s2u: UPDATE parent SET aux = 'baz'; +step s2c: COMMIT; + +starting permutation: s1i s2i s1u s2u s1c s2c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; +step s2u: <... 
completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1i s2i s2u s1u s2c s1c +step s1i: INSERT INTO child VALUES (1, 1); +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; +step s2c: COMMIT; +step s1u: <... completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2i s1i s1u s2u s1c s2c +step s2i: INSERT INTO child VALUES (2, 1); +step s1i: INSERT INTO child VALUES (1, 1); +step s1u: UPDATE parent SET aux = 'bar'; +step s2u: UPDATE parent SET aux = 'baz'; +step s1c: COMMIT; +step s2u: <... completed> +error in steps s1c s2u: ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s2i s1i s2u s1u s2c s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s1i: INSERT INTO child VALUES (1, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1u: UPDATE parent SET aux = 'bar'; +step s2c: COMMIT; +step s1u: <... 
completed> +error in steps s2c s1u: ERROR: could not serialize access due to concurrent update +step s1c: COMMIT; + +starting permutation: s2i s2u s1i s2c s1u s1c +step s2i: INSERT INTO child VALUES (2, 1); +step s2u: UPDATE parent SET aux = 'baz'; +step s1i: INSERT INTO child VALUES (1, 1); +step s2c: COMMIT; +step s1u: UPDATE parent SET aux = 'bar'; +step s1c: COMMIT; diff --git a/src/test/isolation/specs/fk-deadlock2.spec b/src/test/isolation/specs/fk-deadlock2.spec index a8f1516..eefe187 100644 --- a/src/test/isolation/specs/fk-deadlock2.spec +++ b/src/test/isolation/specs/fk-deadlock2.spec @@ -42,18 +42,18 @@ permutation "s1u1" "s1u2" "s2u1" "s1c" "s2u2" "s2c" #permutation "s1u1" "s1u2" "s2u1" "s2u2" "s1c" "s2c" #permutation "s1u1" "s1u2" "s2u1" "s2u2" "s2c" "s1c" #permutation "s1u1" "s2u1" "s1u2" "s1c" "s2u2" "s2c" -permutation "s1u1" "s2u1" "s1u2" "s2u2" "s1c" "s2c" +#permutation "s1u1" "s2u1" "s1u2" "s2u2" "s1c" "s2c" permutation "s1u1" "s2u1" "s1u2" "s2u2" "s2c" "s1c" -permutation "s1u1" "s2u1" "s2u2" "s1u2" "s1c" "s2c" +#permutation "s1u1" "s2u1" "s2u2" "s1u2" "s1c" "s2c" permutation "s1u1" "s2u1" "s2u2" "s1u2" "s2c" "s1c" -#permutation "s1u1" "s2u1" "s2u2" "s2c" "s1u2" "s1c" +permutation "s1u1" "s2u1" "s2u2" "s2c" "s1u2" "s1c" #permutation "s2u1" "s1u1" "s1u2" "s1c" "s2u2" "s2c" -permutation "s2u1" "s1u1" "s1u2" "s2u2" "s1c" "s2c" +#permutation "s2u1" "s1u1" "s1u2" "s2u2" "s1c" "s2c" permutation "s2u1" "s1u1" "s1u2" "s2u2" "s2c" "s1c" -permutation "s2u1" "s1u1" "s2u2" "s1u2" "s1c" "s2c" +#permutation "s2u1" "s1u1" "s2u2" "s1u2" "s1c" "s2c" permutation "s2u1" "s1u1" "s2u2" "s1u2" "s2c" "s1c" -#permutation "s2u1" "s1u1" "s2u2" "s2c" "s1u2" "s1c" +permutation "s2u1" "s1u1" "s2u2" "s2c" "s1u2" "s1c" #permutation "s2u1" "s2u2" "s1u1" "s1u2" "s1c" "s2c" -#permutation "s2u1" "s2u2" "s1u1" "s1u2" "s2c" "s1c" +permutation "s2u1" "s2u2" "s1u1" "s1u2" "s2c" "s1c" permutation "s2u1" "s2u2" "s1u1" "s2c" "s1u2" "s1c" -#permutation "s2u1" "s2u2" "s2c" "s1u1" "s1u2" 
"s1c" +permutation "s2u1" "s2u2" "s2c" "s1u1" "s1u2" "s1c"