POC: Lock updated tuples in tuple_update() and tuple_delete()
Hackers,
When working in the read committed transaction isolation level
(the default), we have the following sequence of actions when
tuple_update() or tuple_delete() finds a concurrently updated tuple.
1. tuple_update()/tuple_delete() returns TM_Updated.
2. tuple_lock().
3. Re-evaluate the plan qual (recheck whether we still need to
update/delete, and calculate the new tuple for an update).
4. tuple_update()/tuple_delete() (this time it should succeed,
since we've previously locked the tuple).
I wonder if we should merge steps 1 and 2. We could reuse some of the
effort already spent during tuple_update()/tuple_delete() when locking
the tuple. In the heap table access method, tuple_lock() has to start
with the first tuple in the update chain, but
tuple_update()/tuple_delete() has already visited it. For undo-based
table access methods, tuple_update()/tuple_delete() starts from the
latest version, so why not place the tuple lock immediately once a
concurrent update is detected? I think this patch should bring some
performance benefits under high concurrency.
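With the patch, the caller instead passes a flag and a slot down to the
table AM, and the AM locks the latest row version itself when it detects
a concurrent update. A sketch of the extended call (the signature is
taken from the attached patch):
```
result = table_tuple_delete(rel, tupleid, cid,
                            snapshot, InvalidSnapshot,
                            true /* wait for commit */ ,
                            &tmfd, false /* changingPart */ ,
                            !IsolationUsesXactSnapshot() /* lockUpdated */ ,
                            lockedSlot);
/* On a concurrent update the AM locks the latest row version, stores
 * it in lockedSlot, sets tmfd.traversed, and still returns TM_Updated,
 * so the executor can proceed directly to EvalPlanQual(). */
```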
Also, the patch simplifies the code in nodeModifyTable.c by getting rid
of the nested switch/case. I also got rid of the extra
table_tuple_fetch_row_version() in ExecUpdate(): why re-fetch the old
tuple, when it should be exactly the same tuple we've just locked?
I'm going to check the performance impact. Thoughts and feedback are welcome.
------
Regards,
Alexander Korotkov
Attachments:
0001-Lock-updated-tuples-in-tuple_update-and-tuple_del-v1.patch
From 7bb1e45353f5b81a9963cda843b1aee0f65b05ad Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Thu, 30 Jun 2022 22:07:12 +0300
Subject: [PATCH] Lock updated tuples in tuple_update() and tuple_delete()
---
src/backend/access/heap/heapam_handler.c | 99 ++++++++--
src/backend/access/table/tableam.c | 6 +-
src/backend/executor/nodeModifyTable.c | 227 ++++++++---------------
src/include/access/tableam.h | 20 +-
4 files changed, 181 insertions(+), 171 deletions(-)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 444f027149c..ffe78bb6b25 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -45,6 +45,12 @@
#include "utils/builtins.h"
#include "utils/rel.h"
+static TM_Result heapam_tuple_lock_internal(Relation relation, ItemPointer tid,
+ Snapshot snapshot, TupleTableSlot *slot,
+ CommandId cid, LockTupleMode mode,
+ LockWaitPolicy wait_policy, uint8 flags,
+ TM_FailureData *tmfd, bool updated);
+
static void reform_and_rewrite_tuple(HeapTuple tuple,
Relation OldHeap, Relation NewHeap,
Datum *values, bool *isnull, RewriteState rwstate);
@@ -299,14 +305,38 @@ heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
static TM_Result
heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
Snapshot snapshot, Snapshot crosscheck, bool wait,
- TM_FailureData *tmfd, bool changingPart)
+ TM_FailureData *tmfd, bool changingPart,
+ bool lockUpdated, TupleTableSlot *lockedSlot)
{
+ TM_Result result;
+
/*
* Currently Deleting of index tuples are handled at vacuum, in case if
* the storage itself is cleaning the dead tuples by itself, it is the
* time to call the index tuple deletion also.
*/
- return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
+ result = heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
+
+ if (result == TM_Updated && lockUpdated)
+ {
+ bool updated = false;
+
+ if (!ItemPointerIndicatesMovedPartitions(&tmfd->ctid))
+ updated = true;
+ result = heapam_tuple_lock_internal(relation, tid, snapshot,
+ lockedSlot, cid, LockTupleExclusive,
+ LockWaitBlock,
+ TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
+ tmfd, updated);
+
+ if (result == TM_Ok)
+ {
+ tmfd->traversed = true;
+ return TM_Updated;
+ }
+ }
+
+ return result;
}
@@ -314,7 +344,8 @@ static TM_Result
heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
CommandId cid, Snapshot snapshot, Snapshot crosscheck,
bool wait, TM_FailureData *tmfd,
- LockTupleMode *lockmode, bool *update_indexes)
+ LockTupleMode *lockmode, bool *update_indexes,
+ bool lockUpdated, TupleTableSlot *lockedSlot)
{
bool shouldFree = true;
HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
@@ -341,14 +372,34 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
if (shouldFree)
pfree(tuple);
+ if (result == TM_Updated && lockUpdated)
+ {
+ bool updated = false;
+
+ if (!ItemPointerIndicatesMovedPartitions(&tmfd->ctid))
+ updated = true;
+
+ result = heapam_tuple_lock_internal(relation, otid, snapshot,
+ lockedSlot, cid, *lockmode,
+ LockWaitBlock,
+ TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
+ tmfd, updated);
+
+ if (result == TM_Ok)
+ {
+ tmfd->traversed = true;
+ return TM_Updated;
+ }
+ }
+
return result;
}
static TM_Result
-heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
- TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
- LockWaitPolicy wait_policy, uint8 flags,
- TM_FailureData *tmfd)
+heapam_tuple_lock_internal(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
+ LockWaitPolicy wait_policy, uint8 flags,
+ TM_FailureData *tmfd, bool updated)
{
BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
TM_Result result;
@@ -363,16 +414,30 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
tuple_lock_retry:
tuple->t_self = *tid;
- result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
- follow_updates, &buffer, tmfd);
+ if (!updated)
+ {
+ result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
+ follow_updates, &buffer, tmfd);
+ }
+ else
+ {
+ result = TM_Updated;
+ }
if (result == TM_Updated &&
(flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
{
- /* Should not encounter speculative tuple on recheck */
- Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
+ if (!updated)
+ {
+ /* Should not encounter speculative tuple on recheck */
+ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
- ReleaseBuffer(buffer);
+ ReleaseBuffer(buffer);
+ }
+ else
+ {
+ updated = false;
+ }
if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
{
@@ -559,6 +624,16 @@ tuple_lock_retry:
return result;
}
+static TM_Result
+heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
+ LockWaitPolicy wait_policy, uint8 flags,
+ TM_FailureData *tmfd)
+{
+ return heapam_tuple_lock_internal(relation, tid, snapshot, slot, cid, mode,
+ wait_policy, flags, tmfd, false);
+}
+
/* ------------------------------------------------------------------------
* DDL related callbacks for heap AM.
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index b3d1a6c3f8f..94bec180814 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -307,7 +307,8 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
GetCurrentCommandId(true),
snapshot, InvalidSnapshot,
true /* wait for commit */ ,
- &tmfd, false /* changingPart */ );
+ &tmfd, false /* changingPart */ ,
+ false, NULL);
switch (result)
{
@@ -356,7 +357,8 @@ simple_table_tuple_update(Relation rel, ItemPointer otid,
GetCurrentCommandId(true),
snapshot, InvalidSnapshot,
true /* wait for commit */ ,
- &tmfd, &lockmode, update_indexes);
+ &tmfd, &lockmode, update_indexes,
+ false, NULL);
switch (result)
{
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a49c3da5b6c..41a955d7072 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -1218,7 +1218,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
*/
static TM_Result
ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
- ItemPointer tupleid, bool changingPart)
+ ItemPointer tupleid, bool changingPart,
+ bool lockUpdated, TupleTableSlot *lockedSlot)
{
EState *estate = context->estate;
@@ -1228,7 +1229,9 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
estate->es_crosscheck_snapshot,
true /* wait for commit */ ,
&context->tmfd,
- changingPart);
+ changingPart,
+ lockUpdated,
+ lockedSlot);
}
/*
@@ -1373,7 +1376,12 @@ ExecDelete(ModifyTableContext *context,
* transaction-snapshot mode transactions.
*/
ldelete:;
- result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart);
+
+ if (!IsolationUsesXactSnapshot())
+ slot = ExecGetReturningSlot(estate, resultRelInfo);
+
+ result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart,
+ !IsolationUsesXactSnapshot(), slot);
switch (result)
{
@@ -1432,81 +1440,28 @@ ldelete:;
EvalPlanQualBegin(context->epqstate);
inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc,
resultRelInfo->ri_RangeTableIndex);
+ ExecCopySlot(inputslot, slot);
- result = table_tuple_lock(resultRelationDesc, tupleid,
- estate->es_snapshot,
- inputslot, estate->es_output_cid,
- LockTupleExclusive, LockWaitBlock,
- TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
- &context->tmfd);
+ Assert(context->tmfd.traversed);
+ epqslot = EvalPlanQual(context->epqstate,
+ resultRelationDesc,
+ resultRelInfo->ri_RangeTableIndex,
+ inputslot);
+ if (TupIsNull(epqslot))
+ /* Tuple not passing quals anymore, exiting... */
+ return NULL;
- switch (result)
+ /*
+ * If requested, skip delete and pass back the updated
+ * row.
+ */
+ if (epqreturnslot)
{
- case TM_Ok:
- Assert(context->tmfd.traversed);
- epqslot = EvalPlanQual(context->epqstate,
- resultRelationDesc,
- resultRelInfo->ri_RangeTableIndex,
- inputslot);
- if (TupIsNull(epqslot))
- /* Tuple not passing quals anymore, exiting... */
- return NULL;
-
- /*
- * If requested, skip delete and pass back the
- * updated row.
- */
- if (epqreturnslot)
- {
- *epqreturnslot = epqslot;
- return NULL;
- }
- else
- goto ldelete;
-
- case TM_SelfModified:
-
- /*
- * This can be reached when following an update
- * chain from a tuple updated by another session,
- * reaching a tuple that was already updated in
- * this transaction. If previously updated by this
- * command, ignore the delete, otherwise error
- * out.
- *
- * See also TM_SelfModified response to
- * table_tuple_delete() above.
- */
- if (context->tmfd.cmax != estate->es_output_cid)
- ereport(ERROR,
- (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
- errmsg("tuple to be deleted was already modified by an operation triggered by the current command"),
- errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
- return NULL;
-
- case TM_Deleted:
- /* tuple already deleted; nothing to do */
- return NULL;
-
- default:
-
- /*
- * TM_Invisible should be impossible because we're
- * waiting for updated row versions, and would
- * already have errored out if the first version
- * is invisible.
- *
- * TM_Updated should be impossible, because we're
- * locking the latest version via
- * TUPLE_LOCK_FLAG_FIND_LAST_VERSION.
- */
- elog(ERROR, "unexpected table_tuple_lock status: %u",
- result);
- return NULL;
+ *epqreturnslot = epqslot;
+ return NULL;
}
-
- Assert(false);
- break;
+ else
+ goto ldelete;
}
case TM_Deleted:
@@ -1569,7 +1524,7 @@ ldelete:;
{
ExecForceStoreHeapTuple(oldtuple, slot, false);
}
- else
+ else if (TupIsNull(slot))
{
if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid,
SnapshotAny, slot))
@@ -1838,7 +1793,8 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo,
static TM_Result
ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot,
- bool canSetTag, UpdateContext *updateCxt)
+ bool canSetTag, UpdateContext *updateCxt, bool lockUpdated,
+ TupleTableSlot *lockedSlot)
{
EState *estate = context->estate;
Relation resultRelationDesc = resultRelInfo->ri_RelationDesc;
@@ -1972,7 +1928,8 @@ lreplace:;
estate->es_crosscheck_snapshot,
true /* wait for commit */ ,
&context->tmfd, &updateCxt->lockmode,
- &updateCxt->updateIndexes);
+ &updateCxt->updateIndexes,
+ lockUpdated, lockedSlot);
if (result == TM_Ok)
updateCxt->updated = true;
@@ -2123,7 +2080,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context,
static TupleTableSlot *
ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot,
- bool canSetTag)
+ bool canSetTag, bool locked)
{
EState *estate = context->estate;
Relation resultRelationDesc = resultRelInfo->ri_RelationDesc;
@@ -2176,12 +2133,32 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
}
else
{
+ TupleTableSlot *oldSlot;
+ bool lockUpdated;
+
/* Fill in the slot appropriately */
ExecUpdatePrepareSlot(resultRelInfo, slot, estate);
redo_act:
+ if (!IsolationUsesXactSnapshot() && !locked)
+ {
+ /* Make sure ri_oldTupleSlot is initialized. */
+ if (unlikely(!resultRelInfo->ri_projectNewInfoValid))
+ ExecInitUpdateProjection(context->mtstate,
+ resultRelInfo);
+
+ /* Fetch the most recent version of old tuple. */
+ lockUpdated = true;
+ oldSlot = resultRelInfo->ri_oldTupleSlot;
+ }
+ else
+ {
+ lockUpdated = false;
+ oldSlot = NULL;
+ }
+
result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot,
- canSetTag, &updateCxt);
+ canSetTag, &updateCxt, lockUpdated, oldSlot);
/*
* If ExecUpdateAct reports that a cross-partition update was done,
@@ -2234,12 +2211,12 @@ redo_act:
{
TupleTableSlot *inputslot;
TupleTableSlot *epqslot;
- TupleTableSlot *oldSlot;
if (IsolationUsesXactSnapshot())
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
+ Assert(!locked);
/*
* Already know that we're going to need to do EPQ, so
@@ -2247,73 +2224,19 @@ redo_act:
*/
inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc,
resultRelInfo->ri_RangeTableIndex);
-
- result = table_tuple_lock(resultRelationDesc, tupleid,
- estate->es_snapshot,
- inputslot, estate->es_output_cid,
- updateCxt.lockmode, LockWaitBlock,
- TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
- &context->tmfd);
-
- switch (result)
- {
- case TM_Ok:
- Assert(context->tmfd.traversed);
-
- epqslot = EvalPlanQual(context->epqstate,
- resultRelationDesc,
- resultRelInfo->ri_RangeTableIndex,
- inputslot);
- if (TupIsNull(epqslot))
- /* Tuple not passing quals anymore, exiting... */
- return NULL;
-
- /* Make sure ri_oldTupleSlot is initialized. */
- if (unlikely(!resultRelInfo->ri_projectNewInfoValid))
- ExecInitUpdateProjection(context->mtstate,
- resultRelInfo);
-
- /* Fetch the most recent version of old tuple. */
- oldSlot = resultRelInfo->ri_oldTupleSlot;
- if (!table_tuple_fetch_row_version(resultRelationDesc,
- tupleid,
- SnapshotAny,
- oldSlot))
- elog(ERROR, "failed to fetch tuple being updated");
- slot = ExecGetUpdateNewTuple(resultRelInfo,
- epqslot, oldSlot);
- goto redo_act;
-
- case TM_Deleted:
- /* tuple already deleted; nothing to do */
- return NULL;
-
- case TM_SelfModified:
-
- /*
- * This can be reached when following an update
- * chain from a tuple updated by another session,
- * reaching a tuple that was already updated in
- * this transaction. If previously modified by
- * this command, ignore the redundant update,
- * otherwise error out.
- *
- * See also TM_SelfModified response to
- * table_tuple_update() above.
- */
- if (context->tmfd.cmax != estate->es_output_cid)
- ereport(ERROR,
- (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
- errmsg("tuple to be updated was already modified by an operation triggered by the current command"),
- errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
- return NULL;
-
- default:
- /* see table_tuple_lock call in ExecDelete() */
- elog(ERROR, "unexpected table_tuple_lock status: %u",
- result);
- return NULL;
- }
+ ExecCopySlot(inputslot, oldSlot);
+ Assert(context->tmfd.traversed);
+
+ epqslot = EvalPlanQual(context->epqstate,
+ resultRelationDesc,
+ resultRelInfo->ri_RangeTableIndex,
+ inputslot);
+ if (TupIsNull(epqslot))
+ /* Tuple not passing quals anymore, exiting... */
+ return NULL;
+ slot = ExecGetUpdateNewTuple(resultRelInfo,
+ epqslot, oldSlot);
+ goto redo_act;
}
break;
@@ -2557,7 +2480,7 @@ ExecOnConflictUpdate(ModifyTableContext *context,
*returning = ExecUpdate(context, resultRelInfo,
conflictTid, NULL,
resultRelInfo->ri_onConflict->oc_ProjSlot,
- canSetTag);
+ canSetTag, true);
/*
* Clear out existing tuple, as there might not be another conflict among
@@ -2764,7 +2687,8 @@ lmerge_matched:;
}
ExecUpdatePrepareSlot(resultRelInfo, newslot, context->estate);
result = ExecUpdateAct(context, resultRelInfo, tupleid, NULL,
- newslot, mtstate->canSetTag, &updateCxt);
+ newslot, mtstate->canSetTag, &updateCxt,
+ false, NULL);
if (result == TM_Ok && updateCxt.updated)
{
ExecUpdateEpilogue(context, &updateCxt, resultRelInfo,
@@ -2782,7 +2706,8 @@ lmerge_matched:;
result = TM_Ok;
break;
}
- result = ExecDeleteAct(context, resultRelInfo, tupleid, false);
+ result = ExecDeleteAct(context, resultRelInfo, tupleid, false,
+ false, NULL);
if (result == TM_Ok)
{
ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL,
@@ -3733,7 +3658,7 @@ ExecModifyTable(PlanState *pstate)
/* Now apply the update. */
slot = ExecUpdate(&context, resultRelInfo, tupleid, oldtuple,
- slot, node->canSetTag);
+ slot, node->canSetTag, false);
break;
case CMD_DELETE:
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index fe869c6c184..2d990163487 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -514,7 +514,9 @@ typedef struct TableAmRoutine
Snapshot crosscheck,
bool wait,
TM_FailureData *tmfd,
- bool changingPart);
+ bool changingPart,
+ bool lockUpdated,
+ TupleTableSlot *lockedSlot);
/* see table_tuple_update() for reference about parameters */
TM_Result (*tuple_update) (Relation rel,
@@ -526,7 +528,9 @@ typedef struct TableAmRoutine
bool wait,
TM_FailureData *tmfd,
LockTupleMode *lockmode,
- bool *update_indexes);
+ bool *update_indexes,
+ bool lockUpdated,
+ TupleTableSlot *lockedSlot);
/* see table_tuple_lock() for reference about parameters */
TM_Result (*tuple_lock) (Relation rel,
@@ -1461,11 +1465,13 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
static inline TM_Result
table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
Snapshot snapshot, Snapshot crosscheck, bool wait,
- TM_FailureData *tmfd, bool changingPart)
+ TM_FailureData *tmfd, bool changingPart,
+ bool lockUpdated, TupleTableSlot *lockedSlot)
{
return rel->rd_tableam->tuple_delete(rel, tid, cid,
snapshot, crosscheck,
- wait, tmfd, changingPart);
+ wait, tmfd, changingPart,
+ lockUpdated, lockedSlot);
}
/*
@@ -1506,12 +1512,14 @@ static inline TM_Result
table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
CommandId cid, Snapshot snapshot, Snapshot crosscheck,
bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
- bool *update_indexes)
+ bool *update_indexes, bool lockUpdated,
+ TupleTableSlot *lockedSlot)
{
return rel->rd_tableam->tuple_update(rel, otid, slot,
cid, snapshot, crosscheck,
wait, tmfd,
- lockmode, update_indexes);
+ lockmode, update_indexes,
+ lockUpdated, lockedSlot);
}
/*
--
2.24.3 (Apple Git-128)
Hi Alexander,
> Thoughts and feedback are welcome.
I took a preliminary look at the patch. I'm going to need more time
to meditate on the proposed changes and to figure out the performance
impact.
So far I just wanted to let you know that the patch applied OK for me
and passed all the tests. The `else` branch seems to be redundant
here:
+ if (!updated)
+ {
+ /* Should not encounter speculative tuple on recheck */
+ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
- ReleaseBuffer(buffer);
+ ReleaseBuffer(buffer);
+ }
+ else
+ {
+ updated = false;
+ }
Also, I wish there were a few more comments, since some of the
proposed changes are not that straightforward.
--
Best regards,
Aleksander Alekseev
Hi again,
+ if (!updated)
+ {
+ /* Should not encounter speculative tuple on recheck */
+ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
- ReleaseBuffer(buffer);
+ ReleaseBuffer(buffer);
+ }
+ else
+ {
+ updated = false;
+ }
OK, I got confused here. I suggest changing the `if (!...) { .. } else
{ .. }` code to `if (...) { .. } else { .. }`, i.e. avoiding the
negated condition.
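That is, something along these lines (untested, just to illustrate):
```
if (updated)
{
    updated = false;
}
else
{
    /* Should not encounter speculative tuple on recheck */
    Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
    ReleaseBuffer(buffer);
}
```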
--
Best regards,
Aleksander Alekseev
Hi Alexander,
> I'm going to need more time to meditate on the proposed changes and to figure out the performance impact.
OK, it turned out this patch is slightly more complicated than I
initially thought, but I think I managed to get some vague
understanding of what's going on.
I tried to reproduce the case with concurrently updated tuples you
described on the current `master` branch. I created a new table:
```
CREATE TABLE phonebook(
"id" SERIAL PRIMARY KEY NOT NULL,
"name" NAME NOT NULL,
"phone" INT NOT NULL);
INSERT INTO phonebook ("name", "phone")
VALUES ('Alice', 123), ('Bob', 456), ('Charlie', 789);
```
Then I opened two sessions and attached to them with LLDB. I did:
```
(lldb) b heapam_tuple_update
(lldb) c
```
... in both cases, because I wanted to see the two calls to
heapam_tuple_update() (steps 1 and 4) and check the return values.
Then I did:
```
session1 =# BEGIN;
session2 =# BEGIN;
session1 =# UPDATE phonebook SET name = 'Alex' WHERE name = 'Alice';
```
This update succeeds and I see heapam_tuple_update() returning TM_Ok.
```
session2 =# UPDATE phonebook SET name = 'Alfred' WHERE name = 'Alice';
```
This update hangs on a lock.
```
session1 =# COMMIT;
```
Now session2 unfreezes and returns 'UPDATE 0'. table_tuple_update()
was called once and returned TM_Updated. Also session2 sees an updated
tuple now. So apparently the visibility check (step 3) didn't pass.
At this point I'm slightly confused. I don't see where a performance
improvement is expected, considering that session2 gets blocked until
session1 commits.
Could you please walk me through here? Am I using the right test case
or maybe you had another one in mind? Which steps do you consider
expensive and expect to be mitigated by the patch?
--
Best regards,
Aleksander Alekseev
Hi Aleksander!
Thank you for your efforts reviewing this patch.
On Thu, Jul 7, 2022 at 12:43 PM Aleksander Alekseev
<aleksander@timescale.com> wrote:
> OK, it turned out this patch is slightly more complicated than I
> initially thought, but I think I managed to get some vague
> understanding of what's going on.
>
> [... test case with two sessions, quoted above, trimmed ...]
>
> Now session2 unfreezes and returns 'UPDATE 0'. table_tuple_update()
> was called once and returned TM_Updated. Also session2 sees an updated
> tuple now. So apparently the visibility check (step 3) didn't pass.
Yes. But it's not exactly a visibility check. Session2 re-evaluates
the WHERE condition on the most recent row version (bypassing the
snapshot). In your test, the recheck evaluates name = 'Alice' against
the latest version of the row, where name is now 'Alex'; the WHERE
condition is not true anymore, thus the row is not updated.
> At this point I'm slightly confused. I don't see where a performance
> improvement is expected, considering that session2 gets blocked until
> session1 commits.
>
> Could you please walk me through here? Am I using the right test case
> or maybe you had another one in mind? Which steps do you consider
> expensive and expect to be mitigated by the patch?
This patch is not intended to change any high-level logic. At the high
level, the transaction that updated the row still holds a lock on it
until it finishes. The possible positive performance impact comes from
doing the work of the two calls, tuple_update() and tuple_lock(), in
one call of tuple_update(). If we do this in one call, we can save
some effort, for instance by locking the same buffer once, not twice.
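For the heap AM the difference looks roughly like this (a simplified
sketch; the real code paths are in heapam_handler.c):
```
/* Before the patch: two separate AM calls; the second one has to
 * pin and lock the tuple's buffer again. */
result = heap_delete(relation, tid, ...);       /* returns TM_Updated */
/* ... back out to the executor, which calls table_tuple_lock() ... */
result = heap_lock_tuple(relation, tuple, ...); /* same buffer again */

/* With the patch: heapam_tuple_delete() continues immediately;
 * passing updated = true makes heapam_tuple_lock_internal() skip the
 * initial heap_lock_tuple() and go straight to following tmfd->ctid
 * to the latest row version. */
result = heap_delete(relation, tid, ...);
if (result == TM_Updated && lockUpdated)
    result = heapam_tuple_lock_internal(relation, tid, snapshot,
                                        lockedSlot, cid, LockTupleExclusive,
                                        LockWaitBlock,
                                        TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
                                        tmfd, true /* updated */ );
```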
------
Regards,
Alexander Korotkov
Hi, hackers!
I ran the following benchmark on the master branch (15) vs. the patch
(15-lock).
On a 36-vcore AWS server, I ran an UPDATE-only pgbench script with 50
connections against pgbench_tellers with 100 rows. The idea was to
introduce as much update concurrency as possible while avoiding having
too many clients in a wait state.
Indexes were not built, to avoid index-update-related delays.
I did 2 runs, each consisting of 6 series of updates (1st run:
master-patch-master-patch-master-patch; 2nd run:
patch-master-patch-master-patch-master).
Each series started a fresh server and did VACUUM FULL, so that heap
bloat from the previous series would not affect the current one. Each
series collected data for 10 minutes, with the first minute's data
being dropped.
Disk-related operations were suppressed where possible (WAL, fsync,
etc.):
postgresql.conf:
fsync = off
autovacuum = off
full_page_writes = off
max_worker_processes = 99
max_parallel_workers = 99
max_connections = 100
shared_buffers = 4096MB
work_mem = 50MB
Attached are pictures of the 2 runs, and the shell and SQL scripts
that were run.
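The attached scripts are not inlined above, but an UPDATE-only pgbench
script of the kind described could look like this (a hypothetical
sketch, not necessarily the exact attached script):
```
-- update_tellers.sql: uniformly update one of the 100 teller rows
\set tid random(1, 100)
UPDATE pgbench_tellers SET tbalance = tbalance + 1 WHERE tid = :tid;
```
run with something like `pgbench -n -c 50 -j 36 -T 600 -f
update_tellers.sql`.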
According to htop, all 36 cores were loaded to ~94% in each series.
I'm not sure how to interpret the results. It seems like the TPS
difference between runs is significant, with the average performance
of the lock patch (15-lock) a little bit better than master (15).
Could someone try to repeat this on another server? What do you think?
--
Best regards,
Pavel Borisov,
Supabase, https://supabase.com/
Attachments:
pgbench-run-2.png