Disallow cancellation of waiting for synchronous replication
Hi, hackers!
This is a continuation of thread [0] in pgsql-general, with proposed changes. As Maksim pointed out, this topic was raised before in [1].
Currently, we can get a split brain with the following combination of steps:
0. Set up a cluster with synchronous replication. Isolate the primary from the standbys.
1. Issue an upsert query: INSERT .. ON CONFLICT DO NOTHING.
2. Cancel query 1 while it waits for synchronous replication.
3. Retry query 1. The idempotent query will succeed, and the client gets confirmation of the written data even though it is not present on any standby.
Thread [0] contains a reproduction from psql; a minimal sketch follows.
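Roughly (a sketch, assuming a pre-created table t and a synchronous standby that has just become unreachable, so the commit blocks in the sync rep wait):

postgres=# insert into t values (1) on conflict do nothing;  -- hangs in the sync rep wait
^CCancel request sent
WARNING:  canceling wait for synchronous replication due to user request
DETAIL:  The transaction has already committed locally, but might not have been replicated to the standby.
INSERT 0 1
postgres=# insert into t values (1) on conflict do nothing;  -- retry succeeds without waiting
INSERT 0 0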
In certain situations we cannot avoid cancellation of timed-out queries. Yes, we could interpret warnings and treat them as errors, but the warning is emitted at step 1, not at step 3.
I think the proper solution here would be to add a GUC that disallows cancellation of the synchronous replication wait. The retry at step 3 will then wait on the locks held by the hanging query 1, and the data will stay consistent.
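With the attached draft applied, usage would look something like this (a sketch based on the patch's GUC, reusing the hypothetical table t from above):

postgres=# set synchronous_commit_cancelation to off;
SET
postgres=# insert into t values (2);
-- a cancel request now only emits a WARNING; the backend keeps waiting
-- until a standby confirms the commit or the connection is terminated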
There is still a problem when the backend is not canceled but terminated [2]. The ideal solution would be to keep the locks on the changed data. Some well-known databases treat termination of synchronous replication as a system failure and refuse to operate until standbys reappear (see Maximum Protection mode). From my point of view it is enough to PANIC once, so that the HA tool is informed that something is going wrong.
In any case, the situation with cancellation is more dangerous; we have observed it in real user cases.
Please find attached a draft of the proposed change.
Best regards, Andrey Borodin.
[0]: /messages/by-id/B70260F9-D0EC-438D-9A59-31CB996B320A@yandex-team.ru
[1]: /messages/by-id/CAEET0ZHG5oFF7iEcbY6TZadh1mosLmfz1HLm311P9VOt7Z+jeg@mail.gmail.com
[2]: https://www.postgresql.org/docs/current/warm-standby.html#SYNCHRONOUS-REPLICATION-HA
Attachments:
0001-Disallow-cancelation-of-syncronous-commit-V1.patch
From 115d5d428a6c7dac8d5d99ff441a91f9988ec793 Mon Sep 17 00:00:00 2001
From: Andrey <amborodin@acm.org>
Date: Fri, 20 Dec 2019 09:36:20 +0500
Subject: [PATCH] Disallow cancelation of syncronous commit V1
Currently we allow canceling the wait for synchronous commit.
Some drivers cancel queries after a timeout. If the application
then retries an idempotent query, it gets confirmation of written data.
This can lead to split brain in HA scenarios. To prevent this,
we add a synchronous_commit_cancelation setting disallowing cancellation
of the synchronous replication wait.
---
src/backend/access/transam/xact.c | 1 +
src/backend/replication/syncrep.c | 45 ++++++++++++++++++++++---------
src/backend/utils/misc/guc.c | 9 +++++++
src/include/access/xact.h | 3 +++
4 files changed, 45 insertions(+), 13 deletions(-)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 5353b6ab0b..844ce36d79 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -80,6 +80,7 @@ bool DefaultXactDeferrable = false;
bool XactDeferrable;
int synchronous_commit = SYNCHRONOUS_COMMIT_ON;
+bool synchronous_commit_cancelation = true;
/*
* When running as a parallel worker, we place only a single
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index 7bf7a1b7f8..96f0d5e8e5 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -250,13 +250,23 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
*/
if (ProcDiePending)
{
- ereport(WARNING,
- (errcode(ERRCODE_ADMIN_SHUTDOWN),
- errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
- errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
- whereToSendOutput = DestNone;
- SyncRepCancelWait();
- break;
+ if (synchronous_commit_cancelation)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
+ errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
+ whereToSendOutput = DestNone;
+ SyncRepCancelWait();
+ break;
+ }
+ else
+ {
+ ereport(PANIC,
+ (errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
+ errdetail("The transaction has already committed locally and might not have been replicated to the standby.")));
+ }
}
/*
@@ -267,12 +277,21 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
*/
if (QueryCancelPending)
{
- QueryCancelPending = false;
- ereport(WARNING,
- (errmsg("canceling wait for synchronous replication due to user request"),
- errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
- SyncRepCancelWait();
- break;
+ if (synchronous_commit_cancelation)
+ {
+ QueryCancelPending = false;
+ ereport(WARNING,
+ (errmsg("canceling wait for synchronous replication due to user request"),
+ errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
+ SyncRepCancelWait();
+ break;
+ }
+ else
+ {
+ ereport(WARNING,
+ (errmsg("cancellation of wait for synchronous replication was requested, but it is not allowed"),
+ errdetail("The transaction has already committed locally and might not have been replicated to the standby. We must wait here.")));
+ }
}
/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8d951ce404..c30aad97b3 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1222,6 +1222,15 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
+ {
+ {"synchronous_commit_cancelation", PGC_USERSET, WAL_SETTINGS,
+ gettext_noop("Allows cancellation of waiting for replication of a locally committed transaction."),
+ NULL
+ },
+ &synchronous_commit_cancelation,
+ true, NULL, NULL, NULL
+ },
+
{
{"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs each checkpoint."),
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 9d2899dea1..5443c25edf 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -81,6 +81,9 @@ typedef enum
/* Synchronous commit level */
extern int synchronous_commit;
+/* Allow cancellation of queries waiting for sync replication but committed locally */
+extern bool synchronous_commit_cancelation;
+
/*
* Miscellaneous flag bits to record events which occur on the top level
* transaction. These flags are only persisted in MyXactFlags and are intended
--
2.20.1
On Fri, Dec 20, 2019 at 6:04 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
I think the proper solution here would be to add a GUC that disallows cancellation of the synchronous replication wait. The retry at step 3 will then wait on the locks held by the hanging query 1, and the data will stay consistent.
There is still a problem when the backend is not canceled but terminated [2]. The ideal solution would be to keep the locks on the changed data. Some well-known databases treat termination of synchronous replication as a system failure and refuse to operate until standbys reappear (see Maximum Protection mode). From my point of view it is enough to PANIC once, so that the HA tool is informed that something is going wrong.
Sending a cancellation is currently the only way to resume after
disabling synchronous replication. Some HA solutions (e.g.
pg_auto_failover) rely on this behaviour. Would it be worth checking
whether synchronous replication is still required?
Marco
On 20 Dec 2019, at 12:23, Marco Slot <marco@citusdata.com> wrote:
On Fri, Dec 20, 2019 at 6:04 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
I think the proper solution here would be to add a GUC that disallows cancellation of the synchronous replication wait. The retry at step 3 will then wait on the locks held by the hanging query 1, and the data will stay consistent.
There is still a problem when the backend is not canceled but terminated [2]. The ideal solution would be to keep the locks on the changed data. Some well-known databases treat termination of synchronous replication as a system failure and refuse to operate until standbys reappear (see Maximum Protection mode). From my point of view it is enough to PANIC once, so that the HA tool is informed that something is going wrong.

Sending a cancellation is currently the only way to resume after
disabling synchronous replication. Some HA solutions (e.g.
pg_auto_failover) rely on this behaviour. Would it be worth checking
whether synchronous replication is still required?
I think changing synchronous_standby_names to some available standbys will resume all backends waiting for synchronous replication.
Do we need to check the necessity of synchronous replication in any other case?
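For example (a sketch; 'standby2' stands for some reachable standby, and whether waiters are actually released after a reload is discussed below):

postgres=# alter system set synchronous_standby_names to 'standby2';
ALTER SYSTEM
postgres=# select pg_reload_conf();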
Best regards, Andrey Borodin.
Andrey Borodin <x4mmm@yandex-team.ru> writes:
I think the proper solution here would be to add a GUC that disallows cancellation of the synchronous replication wait.
This sounds entirely insane to me. There is no possibility that you
can prevent a failure from occurring at this step.
There is still a problem when the backend is not canceled but terminated [2].
Exactly. If you don't have a fix that handles that case, you don't have
anything. In fact, you've arguably made things worse, by increasing the
temptation to terminate or "kill -9" the nonresponsive session.
regards, tom lane
On Fri, Dec 20, 2019 at 03:07:26PM +0500, Andrey Borodin wrote:
Sending a cancellation is currently the only way to resume after
disabling synchronous replication. Some HA solutions (e.g.
pg_auto_failover) rely on this behaviour. Would it be worth checking
whether synchronous replication is still required?

I think changing synchronous_standby_names to some available
standbys will resume all backends waiting for synchronous
replication. Do we need to check the necessity of synchronous
replication in any other case?
Yeah, I am not on board with the concept of this thread. Depending
on your HA configuration you can also reset synchronous_standby_names
after a certain small-ish threshold has been reached in WAL to get at
the same result by disabling synchronous replication, though your
cluster cannot safely perform a failover, so you need to keep track of
that state. Something which would be useful is to improve some cases
where you still want to use synchronous replication by switching to a
different standby. I recall that sometimes that can be rather slow.
--
Michael
On Fri, Dec 20, 2019 at 11:07 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
I think changing synchronous_standby_names to some available standbys will resume all backends waiting for synchronous replication.
Do we need to check the necessity of synchronous replication in any other case?
The GUCs are not re-checked in the main loop in SyncRepWaitForLSN, so
backends will remain stuck there even if synchronous replication has
been (temporarily) disabled while they were waiting.
I do agree with the general sentiment that terminating the connection
is preferable over sending a response to the client (except when
synchronous replication was already disabled). Synchronous replication
does not guarantee that a committed write is actually on any replica,
but it does in general guarantee that a commit has been replicated
before sending a response to the client. That's arguably more
important, because the rest of what the application does might depend on the
transaction completing and replicating successfully. I don't know of
cases other than cancellation in which a response is sent to the
client without replication when synchronous replication is enabled.
The error level should be FATAL instead of PANIC, since PANIC restarts
the database and I don't think there is a reason to do that.
Marco
On 21 Dec 2019, at 02:19, Tom Lane <tgl@sss.pgh.pa.us> wrote:
Andrey Borodin <x4mmm@yandex-team.ru> writes:
I think the proper solution here would be to add a GUC that disallows cancellation of the synchronous replication wait.
This sounds entirely insane to me. There is no possibility that you
can prevent a failure from occurring at this step.
Yes, we cannot prevent the failure. If we wait here for a long time and someone cancels the query, the node has probably failed and the database already lives in some other availability zone.
All we should do is refuse to commit anything here; any data committed here will be lost.
There is still a problem when the backend is not canceled but terminated [2].
Exactly. If you don't have a fix that handles that case, you don't have
anything. In fact, you've arguably made things worse, by increasing the
temptation to terminate or "kill -9" the nonresponsive session.
Currently, any Postgres HA solution can lose data when the application issues INSERT ... ON CONFLICT DO NOTHING with a retry. No DBA mistake is needed, just a driver capable of issuing a cancel on timeout.
An administrator issuing kill -9 is OK; the database must shut down to prevent a split brain. Preferably, the database should refuse to start after such a shutdown.
I'm not proposing this behavior as the default. If an administrator (or HA tool) configured the DB in this mode, they probably know what they are doing.
On 21 Dec 2019, at 15:34, Marco Slot <marco@citusdata.com> wrote:
On Fri, Dec 20, 2019 at 11:07 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
I think changing synchronous_standby_names to some available standbys will resume all backends waiting for synchronous replication.
Do we need to check the necessity of synchronous replication in any other case?

The GUCs are not re-checked in the main loop in SyncRepWaitForLSN, so
backends will remain stuck there even if synchronous replication has
been (temporarily) disabled while they were waiting.
SyncRepInitConfig() will be called on SIGHUP. Backends waiting for synchronous replication will wake up on WAIT_EVENT_SYNC_REP and happily succeed.
I do agree with the general sentiment that terminating the connection
is preferable over sending a response to the client (except when
synchronous replication was already disabled). Synchronous replication
does not guarantee that a committed write is actually on any replica,
but it does in general guarantee that a commit has been replicated
before sending a response to the client. That's arguably more
important, because the rest of what the application does might depend on the
transaction completing and replicating successfully. I don't know of
cases other than cancellation in which a response is sent to the
client without replication when synchronous replication is enabled.

The error level should be FATAL instead of PANIC, since PANIC restarts
the database and I don't think there is a reason to do that.
Terminating the connection is unacceptable, actually. The client will retry the idempotent query; the retried query no longer needs to write anything and will be committed.
We need to shut down the database and prevent it from starting. We should not acknowledge any data before the synchronous replication configuration allows us to.
When the client tries to cancel their query, we refuse to do so and hold their write locks. If anyone terminates the connection, the locks will be released. It is better to shut down the whole DB than to release these locks and continue to receive queries.
None of this applies to simple cases where a user accidentally enabled synchronous replication. This is a setup for a quite sophisticated HA tool, which will rewind the local database once the transient network partition is over and the old timeline is archived, and attach it to the new primary.
Best regards, Andrey Borodin.
On 21.12.2019 00:19, Tom Lane wrote:
There is still a problem when the backend is not canceled but terminated [2].
Exactly. If you don't have a fix that handles that case, you don't have
anything. In fact, you've arguably made things worse, by increasing the
temptation to terminate or "kill -9" the nonresponsive session.
I assume that the termination of a backend that causes termination of
the PostgreSQL instance in Andrey's patch proposal has to be resolved by
external HA agents, which, as the parent process of the postmaster, could
intercept such terminations and make appropriate decisions, e.g., restart
the PostgreSQL node in a state closed to external users (via pg_hba.conf
manipulation) until all sync replicas have synchronized the changes from
the master. The Stolon HA tool implements this strategy [1]. This logic
(waiting for all replicas declared in synchronous_standby_names to
replicate all WAL from the master) could be implemented inside the
PostgreSQL kernel, after the recovery process starts and before the
database is opened to users; this can be done separately later.
Another approach is to implement two-phase commit across the master and
sync replicas (as Oracle did in old versions [2]), where the risk of
getting locally committed data under instance restarts and query cancels
is minimal (once the final commit phase has started). But this approach
has a latency penalty and the complexity of resolving partial (prepared
but not committed) transactions under coordinator (in this case, master
node) failure in automatic mode. It would be nice if this approach were
implemented later as an option of synchronous commit.
[2]: https://docs.oracle.com/cd/B28359_01/server.111/b28326/repmaster.htm#i33607
--
Best regards,
Maksim Milyutin
On 21.12.2019 13:34, Marco Slot wrote:
I do agree with the general sentiment that terminating the connection
is preferable over sending a response to the client (except when
synchronous replication was already disabled).
But in this case, locally committed data becomes visible to new incoming
transactions, which is a bad side effect of this issue. Under failover,
those changes are potentially undone.
Synchronous replication
does not guarantee that a committed write is actually on any replica,
but it does in general guarantee that a commit has been replicated
before sending a response to the client. That's arguably more
important, because the rest of what the application does might depend on the
transaction completing and replicating successfully. I don't know of
cases other than cancellation in which a response is sent to the
client without replication when synchronous replication is enabled.
Yes, at query cancellation (e.g., by a timeout from the client driver) the
client receives a response saying the transaction completed (though with a
warning, which not all client drivers can handle properly), and the guarantee
that the transaction was successfully replicated is *violated*.
--
Best regards,
Maksim Milyutin
On 25 Dec 2019, at 15:28, Maksim Milyutin <milyutinma@gmail.com> wrote:
Synchronous replication
does not guarantee that a committed write is actually on any replica,
but it does in general guarantee that a commit has been replicated
before sending a response to the client. That's arguably more
important, because the rest of what the application does might depend on the
transaction completing and replicating successfully. I don't know of
cases other than cancellation in which a response is sent to the
client without replication when synchronous replication is enabled.

Yes, at query cancellation (e.g., by a timeout from the client driver) the client receives a response saying the transaction completed (though with a warning, which not all client drivers can handle properly), and the guarantee that the transaction was successfully replicated is *violated*.
We obviously need a design discussion here to address the issue. But the immediate question is: should we add this topic to the January CF items?
Best regards, Andrey Borodin.
On Wed, Dec 25, 2019, 11:28 Maksim Milyutin <milyutinma@gmail.com> wrote:
But in this case, locally committed data becomes visible to new incoming
transactions, which is a bad side effect of this issue.
Your application should be prepared for that in any case.
At the point where synchronous replication waits, the commit has already
been written to disk on the primary. If postgres restarts while waiting for
replication then the write becomes immediately visible regardless of
whether it was replicated. I don't think throwing a PANIC actually prevents
that and if it does it's coincidental. Best you can do is signal to the
client that the commit status is unknown.
That's far from ideal, but fixing it requires a much bigger change to
streaming replication. The write should be replicated prior to commit on
the primary, but applied after in a way where unapplied writes on the
secondary can be overwritten/discarded if it turns out they did not commit
on the primary.
Marco
On 25.12.2019 14:27, Marco Slot wrote:
On Wed, Dec 25, 2019, 11:28 Maksim Milyutin <milyutinma@gmail.com> wrote:

But in this case, locally committed data becomes visible to new incoming
transactions, which is a bad side effect of this issue.

Your application should be prepared for that in any case.
At the point where synchronous replication waits, the commit has
already been written to disk on the primary. If postgres
restarts while waiting for replication then the write becomes
immediately visible regardless of whether it was replicated.
Yes, this write is recovered after the instance starts. For now, I want
to delegate this case to an external HA tool around PostgreSQL. It can
handle the instance stopping and either switch over to a sync replica or
start the current instance with connections from external users closed
until all writes have replicated to the sync replicas. Later, arguably,
closing connections after the recovery process could be implemented
inside the kernel.
I don't think throwing a PANIC actually prevents that and if it does
it's coincidental.
PANIC brings down the instance and doesn't let clients read the locally
committed data. The HA tool takes further steps to close access to this
data, as described above.
That's far from ideal, but fixing it requires a much bigger change to
streaming replication. The write should be replicated prior to commit
on the primary, but applied after in a way where unapplied writes on
the secondary can be overwritten/discarded if it turns out they did
not commit on the primary.
Thanks for sharing your opinion about enhancing the synchronous commit
protocol. My position is laid out in [1]. It would be nice to see the
positions of other members of the community.
[1]: /messages/by-id/f3ffc220-e601-cc43-3784-f9bba66dc382@gmail.com
--
Best regards,
Maksim Milyutin
On 25.12.2019 13:45, Andrey Borodin wrote:
On 25 Dec 2019, at 15:28, Maksim Milyutin <milyutinma@gmail.com> wrote:
Synchronous replication
does not guarantee that a committed write is actually on any replica,
but it does in general guarantee that a commit has been replicated
before sending a response to the client. That's arguably more
important, because the rest of what the application does might depend on the
transaction completing and replicating successfully. I don't know of
cases other than cancellation in which a response is sent to the
client without replication when synchronous replication is enabled.

Yes, at query cancellation (e.g., by a timeout from the client driver) the client receives a response saying the transaction completed (though with a warning, which not all client drivers can handle properly), and the guarantee that the transaction was successfully replicated is *violated*.
We obviously need a design discussion here to address the issue. But the immediate question is: should we add this topic to the January CF items?
+1 on posting this topic to January CF.
Andrey, some fixes from me:
1) pulled the resetting of QueryCancelPending out of the internal branch
where synchronous_commit_cancelation is set, to avoid useless
iterations printing the message "canceling the wait for ...";
2) rewrote the errdetail message for query cancellation: I hold that in this case
we cannot assert that the transaction committed locally, because its changes
are not yet visible, so I propose to write about the locally flushed
COMMIT WAL record.
The updated patch is attached.
--
Best regards,
Maksim Milyutin
Attachments:
0001-Disallow-cancelation-of-syncronous-commit-V2.patch
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 5353b6ab0b..844ce36d79 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -80,6 +80,7 @@ bool DefaultXactDeferrable = false;
bool XactDeferrable;
int synchronous_commit = SYNCHRONOUS_COMMIT_ON;
+bool synchronous_commit_cancelation = true;
/*
* When running as a parallel worker, we place only a single
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index 7bf7a1b7f8..0a533d2d79 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -250,13 +250,21 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
*/
if (ProcDiePending)
{
- ereport(WARNING,
+ if (synchronous_commit_cancelation)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
+ errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
+ whereToSendOutput = DestNone;
+ SyncRepCancelWait();
+ break;
+ }
+
+ ereport(PANIC,
(errcode(ERRCODE_ADMIN_SHUTDOWN),
- errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
- errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
- whereToSendOutput = DestNone;
- SyncRepCancelWait();
- break;
+ errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
+ errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
}
/*
@@ -268,11 +276,18 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
if (QueryCancelPending)
{
QueryCancelPending = false;
+ if (synchronous_commit_cancelation)
+ {
+ ereport(WARNING,
+ (errmsg("canceling wait for synchronous replication due to user request"),
+ errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
+ SyncRepCancelWait();
+ break;
+ }
+
ereport(WARNING,
- (errmsg("canceling wait for synchronous replication due to user request"),
- errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
- SyncRepCancelWait();
- break;
+ (errmsg("cancellation of wait for synchronous replication was requested, but it is not allowed"),
+ errdetail("The COMMIT record has already been flushed to WAL locally and might not have been replicated to the standby. We must wait here.")));
}
/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 4b4911d5ec..a279faaeb5 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1222,6 +1222,15 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
+ {
+ {"synchronous_commit_cancelation", PGC_USERSET, WAL_SETTINGS,
+ gettext_noop("Allows cancellation of waiting for replication of a locally committed transaction."),
+ NULL
+ },
+ &synchronous_commit_cancelation,
+ true, NULL, NULL, NULL
+ },
+
{
{"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs each checkpoint."),
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 9d2899dea1..5443c25edf 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -81,6 +81,9 @@ typedef enum
/* Synchronous commit level */
extern int synchronous_commit;
+/* Allow cancellation of queries waiting for sync replication but committed locally */
+extern bool synchronous_commit_cancelation;
+
/*
* Miscellaneous flag bits to record events which occur on the top level
* transaction. These flags are only persisted in MyXactFlags and are intended
On Fri, Dec 20, 2019 at 12:04 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
Currently, we can get a split brain with the following combination of steps:
0. Set up a cluster with synchronous replication. Isolate the primary from the standbys.
1. Issue an upsert query: INSERT .. ON CONFLICT DO NOTHING.
2. Cancel query 1 while it waits for synchronous replication.
3. Retry query 1. The idempotent query will succeed, and the client gets confirmation of the written data even though it is not present on any standby.
It seems to me that in order for synchronous replication to work
reliably, you've got to be very careful about any situation where a
commit might or might not have completed, and this is one such
situation. When the client sends the query cancel, it does not and
cannot know whether the INSERT statement has not yet completed, has
already completed but not yet replicated, or has completed and
replicated but not yet sent back a response. However, the server's
response will be different in each of those cases, because in the
second case, there will be a WARNING about synchronous replication
having been interrupted. If the client ignores that WARNING, there are
going to be problems.
Now, even if you do pay attention to the warning, things are not
totally great here, because if you have inadvertently interrupted a
replication wait, how do you recover? You can't send a command that
means "oh, I want to wait after all." You would have to check the
standbys yourself, from the application code, and see whether the
changes that the query made have shown up there, or check the LSN on
the master and wait for the standbys to advance to that LSN. That's
not great, but might be doable for some applications.
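A rough sketch of that LSN check (run on the master; 'standby1' is a hypothetical application_name, and which column to compare depends on the synchronous_commit level in use):

postgres=# select pg_current_wal_lsn();  -- note the master's current LSN
postgres=# select application_name, flush_lsn, replay_lsn
           from pg_stat_replication
           where application_name = 'standby1';
-- poll until flush_lsn (or replay_lsn, for remote_apply) has advanced
-- past the LSN noted above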
One idea that I had during the initial discussion around synchronous
replication was that maybe there ought to be a distinction between
trying to cancel the query and trying to cancel the replication wait.
Imagine that you could send a cancel that would only cancel
replication waits but not queries, or only queries but not replication
waits. Then you could solve this problem by asking the server to
PQcancelWithAdvancedMagic(conn, PQ_CANCEL_TYPE_QUERY). I wasn't sure
that people would want this, and it didn't seem essential for the
version of this feature, but maybe this example shows that it would be
worthwhile. I don't really have any idea how you'd integrate such a
feature with psql, but maybe it would be good enough to have it
available through the C interface. Also, it's a wire-protocol change,
so there are compatibility issues to think about.
All that being said, like Tom and Michael, I don't think teaching the
backend to ignore cancels is the right approach. We have had
innumerable problems over the years that were caused by the backend
failing to respond to cancels when we would really have liked it to do
so, and users REALLY hate it when you tell them that they have to shut
down and restart (with crash recovery) the entire database because of
a single stuck backend.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 29.12.2019 00:55, Robert Haas wrote:
On Fri, Dec 20, 2019 at 12:04 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
Currently, we can get a split brain with the following combination of steps:
0. Set up a cluster with synchronous replication. Isolate the primary from the standbys.
1. Issue an upsert query: INSERT .. ON CONFLICT DO NOTHING.
2. Cancel query 1 while it waits for synchronous replication.
3. Retry query 1. The idempotent query will succeed, and the client gets confirmation of the written data even though it is not present on any standby.

All that being said, like Tom and Michael, I don't think teaching the
backend to ignore cancels is the right approach. We have had
innumerable problems over the years that were caused by the backend
failing to respond to cancels when we would really have liked it to do
so, and users REALLY hate it when you tell them that they have to shut
down and restart (with crash recovery) the entire database because of
a single stuck backend.
The stuck backend is not deadlocked here. To smoothly cancel the
backend's wait, it is enough for the client to turn off synchronous replication
(change synchronous_standby_names through a server reload) or to switch the
synchronous replica to another live one (again by changing
synchronous_standby_names). In the first case, the client explicitly accepts the
existence of local (non-replicated) commits on the master.
--
Best regards,
Maksim Milyutin
On Sat, Dec 28, 2019 at 6:19 PM Maksim Milyutin <milyutinma@gmail.com> wrote:
The stuck backend is not deadlocked here. To smoothly cancel the
backend's wait, it is enough for the client to turn off synchronous replication
(change synchronous_standby_names through a server reload) or to switch the
synchronous replica to another live one (again by changing
synchronous_standby_names). In the first case, the client explicitly accepts the
existence of local (non-replicated) commits on the master.
Sure, that's true. But I still maintain that responding to ^C is an
important property of the system. If you have to do some more
complicated set of steps like the ones you propose here, a decent
number of people aren't going to figure it out and will end up
unhappy. Now, as it is, you're unhappy, so I guess you can't please
everyone, but you asked for opinions so I'm giving you mine.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 29 Dec 2019, at 04:54, Robert Haas <robertmhaas@gmail.com> wrote:
On Sat, Dec 28, 2019 at 6:19 PM Maksim Milyutin <milyutinma@gmail.com> wrote:
The stuck backend is not deadlocked here. To smoothly cancel the
backend's wait, it is enough for the client to turn off synchronous replication
(change synchronous_standby_names through a server reload) or to switch the
synchronous replica to another live one (again by changing
synchronous_standby_names). In the first case, the client explicitly accepts the
existence of local (non-replicated) commits on the master.

Sure, that's true. But I still maintain that responding to ^C is an
important property of the system.
Not losing data is a nice property of a database too.
Currently, synchronous replication fails to provide its guarantee: no data should be acknowledged until it is replicated.
We want to create a mode where this guarantee is provided.
When the user issues a CANCEL, we could return a warning or an error, but we should not drop the data locks. Other transactions should not get acknowledged on the basis of non-replicated data.
If you have to do some more
complicated set of steps like the ones you propose here, a decent
number of people aren't going to figure it out and will end up
unhappy. Now, as it is, you're unhappy, so I guess you can't please
everyone, but you asked for opinions so I'm giving you mine.
There are many cases where we do not allow the user to shoot themselves in the foot, for example anti-wraparound vacuum. A single-user VACUUM FREEZE is much less painful than a split brain. In the case of wraparound protection, there are deterministic steps to take to get your database back consistent. But in the case of a split brain, there is no single plan for a cure.
Best regards, Andrey Borodin.
On Sat, Dec 28, 2019 at 04:55:55PM -0500, Robert Haas wrote:
On Fri, Dec 20, 2019 at 12:04 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
Currently, we can get a split brain with the following combination of steps:
0. Set up a cluster with synchronous replication. Isolate the primary from the standbys.
1. Issue an upsert query: INSERT .. ON CONFLICT DO NOTHING.
2. Cancel query 1 while it waits for synchronous replication.
3. Retry query 1. The idempotent query will succeed, and the client gets confirmation of the written data even though it is not present on any standby.

It seems to me that in order for synchronous replication to work
reliably, you've got to be very careful about any situation where a
commit might or might not have completed, and this is one such
situation. When the client sends the query cancel, it does not and
cannot know whether the INSERT statement has not yet completed, has
already completed but not yet replicated, or has completed and
replicated but not yet sent back a response. However, the server's
response will be different in each of those cases, because in the
second case, there will be a WARNING about synchronous replication
having been interrupted. If the client ignores that WARNING, there are
going to be problems.
This gets to the heart of something I was hoping to discuss. When is
something committed? You would think it is when the client receives the
commit message, but Postgres can commit something, and try to inform the
client but fail to inform, perhaps due to network problems. In Robert's
case above, we send a "success", but it is only a success on the primary
and not on the synchronous standby.
In the first case I mentioned, we commit without guaranteeing the client
knows, but in the second case, we tell the client success with a warning
that the synchronous standby didn't get the commit. Are clients even
checking warning messages? You see it in psql, but what about
applications that use Postgres? Do they even check for warnings?
Should administrators be informed via email or some command when this
happens?
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com
+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +
On Sun, Dec 29, 2019 at 4:13 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
Not losing data is a nice property of a database too.
Sure, but there's more than one way to fix that problem, as I pointed
out in my first response.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Mon, Dec 30, 2019 at 9:39 AM Bruce Momjian <bruce@momjian.us> wrote:
This gets to the heart of something I was hoping to discuss. When is
something committed? You would think it is when the client receives the
commit message, but Postgres can commit something and try to inform the
client but fail, perhaps due to network problems.
This kind of problem can happen even without synchronous replication.
I've alluded to this problem in a couple of blog posts I've done on
sync rep.
If an application is connected to the database and sends a COMMIT
command (or a data-modifying command outside a transaction that will
commit implicitly) and the connection is closed before it receives a
response, it does not know whether the COMMIT actually happened. It
will have to wait until the database is back up and running and then
go examine the state of the database with SELECT statements and try to
figure out whether the changes it wanted actually got made. Otherwise
it doesn't know whether the failure that resulted in a loss of network
connectivity occurred before or after the commit.
I imagine that most applications are way too dumb to do this properly
and just report errors to the user and let the user decide what to do
to try to recover. And I imagine that most users are not terribly
careful about it and such events cause minor data loss/corruption on a
regular basis. But there are also probably some applications where
people are really fanatical about it.
In Robert's
case above, we send a "success", but it is only a success on the primary
and not on the synchronous standby.

In the first case I mentioned, we commit without guaranteeing the client
knows, but in the second case, we tell the client success with a warning
that the synchronous standby didn't get the commit. Are clients even
checking warning messages? You see it in psql, but what about
applications that use Postgres? Do they even check for warnings?
Should administrators be informed via email or some command when this
happens?
I suspect a lot of clients are not checking warning messages, but
whether that's really the server's problem is arguable. I think we've
developed a general practice over the years of trying to avoid warning
messages as a way of telling users about problems, and that's a good
idea in general precisely because they might just get ignored, but
there are cases where it is really the only possible way forward. It
would certainly be pretty bad to have the COMMIT succeed on the local
node but produce an ERROR; that would doubtless be much more confusing
than what it's doing now. There's nothing at all to prevent
administrators from watching the logs for such warnings and taking
whatever action they deem appropriate.
I continue to think that the root cause of this issue is that we can't
distinguish between cancelling the query and cancelling the sync rep
wait. The client in this case is asking for both when it really only
wants the former, and then ignoring the warning that the latter is
what actually occurred.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 2 Jan 2020, at 19:13, Robert Haas <robertmhaas@gmail.com> wrote:
On Sun, Dec 29, 2019 at 4:13 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
Not losing data is a nice property of a database too.
Sure, but there's more than one way to fix that problem, as I pointed
out in my first response.
Sorry, it took me a few more readings of your message to understand the problem you are writing about.
You proposed two solutions:
1. The client analyzes the warning and understands that the data is not actually committed. This, as you pointed out, does not solve the problem: the data is lost for another client, who never saw the warning.
Actually, the "client" is a stateless set of connections unable to communicate with each other by any means besides the database. They cannot share information about uncommitted transactions (they would need a database for that, hence a chicken-and-egg problem).
2. Add another message, "CANCEL --force", to stop synchronous replication for a specific backend.
We already have a way to stop synchronous replication: "alter system set synchronous_standby_names to 'working.stand.by'; select pg_reload_conf();". This stops it for every backend, while "CANCEL --force" would be more selective.
The user can still lose data when they issue an idempotent query based on data committed by "CANCEL --force". Moreover, the user can lose data if their upsert is based on data committed by someone else with "set synchronous_commit to off".
We could fix upserts: make them wait for replication even if nothing was changed, but this would not cover the case where the user does a SELECT and decides not to insert anything.
We could fix SELECT: if the user asks for synchronous_commit=remote_write, give them a snapshot no newer than the synchronously committed data. ISTM this would solve all of the above problems, but I do not see all the implications of this approach. We would have to add all XIDs to the XIP array if their commit LSN > the sync rep LSN, and I'm not sure all the other transactional mechanics would be OK with this.
From a practical point of view, when all writing queries use the same synchronous_commit level, the easiest solution is to just disallow cancellation of the sync replication wait. In psql we could just reset the connection on a second CTRL+C. That's more generic than "CANCEL --force".
When all queries run with the same synchronous_commit, there is no point in a protocol message for canceling sync rep for a single connection; just drop that connection. Ignoring the cancel is the only way to satisfy the synchronous_commit level, which is constant for the transaction.
When queries run with various synchronous_commit levels, things are much more complicated. Adding a protocol message to change synchronous_commit for running queries does not seem to be a viable option.
I continue to think that the root cause of this issue is that we can't
distinguish between cancelling the query and cancelling the sync rep
wait.
Yes, it is. But canceling the sync rep wait already exists: just change synchronous_standby_names. Canceling sync rep for one client is, effectively, changing the synchronous commit level for a running transaction. It opens the way to far more difficult complications.
The client in this case is asking for both when it really only
wants the former, and then ignoring the warning that the latter is
what actually occurred.
The client is not ignoring warnings. The data is lost for the client that never received the warning. If we could just fix this in our code, I would not be making so much noise. There are workarounds, but they are not pleasant to explain.
Best regards, Andrey Borodin.
On Thu, Jan 2, 2020 at 10:26:16PM +0500, Andrey Borodin wrote:
On 2 Jan 2020, at 19:13, Robert Haas <robertmhaas@gmail.com> wrote:
On Sun, Dec 29, 2019 at 4:13 AM Andrey Borodin <x4mmm@yandex-team.ru> wrote:
Not losing data is a nice property of a database too.

Sure, but there's more than one way to fix that problem, as I pointed
out in my first response.

Sorry, it took me a few more readings of your message to understand the problem you are writing about.
You proposed two solutions:
1. The client analyzes the warning and understands that the data is not actually committed. This, as you pointed out, does not solve the problem: the data is lost for another client, who never saw the warning.
Actually, the "client" is a stateless set of connections unable to communicate with each other by any means besides the database. They cannot share information about uncommitted transactions (they would need a database for that, hence a chicken-and-egg problem).
Actually, it might be worse than that. In my reading of
RecordTransactionCommit(), we do this:
write to WAL
flush WAL (durable)
make visible to other backends
replicate
communicate to the client
I think this means we make the transaction commit visible to all
backends _before_ we replicate it, and potentially wait until we get a
replication reply to return SUCCESS to the client. This means other
clients are acting on data that is durable on the local machine, but not
on the replicated machine, even if synchronous_standby_names is set.
I feel this topic needs a lot more thought before we consider changing
anything.
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com
+ As you are, so once was I. As I am, so you will be. +
+ Ancient Roman grave inscription +
On 11 Jan 2020, at 07:34, Bruce Momjian <bruce@momjian.us> wrote:
Actually, it might be worse than that. In my reading of
RecordTransactionCommit(), we do this:

write to WAL
flush WAL (durable)
make visible to other backends
replicate
communicate to the client

I think this means we make the transaction commit visible to all
backends _before_ we replicate it, and potentially wait until we get a
replication reply to return SUCCESS to the client.
No. The data is not visible to other backends while we await sync rep. It's easy to check.
In one psql session, you can start waiting for sync rep:
postgres=# \d+ x
Table "public.x"
Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
--------+---------+-----------+----------+---------+----------+--------------+-------------
key | integer | | not null | | plain | |
data | text | | | | extended | |
Indexes:
"x_pkey" PRIMARY KEY, btree (key)
Access method: heap
postgres=# alter system set synchronous_standby_names to 'nonexistent';
ALTER SYSTEM
postgres=# select pg_reload_conf();
2020-01-12 16:09:58.167 +05 [45677] LOG: received SIGHUP, reloading configuration files
pg_reload_conf
----------------
t
(1 row)
postgres=# insert into x values (7, '7');
In another session, try to see the inserted (already locally committed) data:
postgres=# select * from x where key = 7;
key | data
-----+------
(0 rows)
Try to insert the same data, and the backend will hang on locks:
postgres=# insert into x values (7,'7') on conflict do nothing;
ProcessQuery (in postgres) + 189 [0x1014b05bd]
standard_ExecutorRun (in postgres) + 301 [0x101339fcd]
ExecModifyTable (in postgres) + 1106 [0x101362b62]
ExecInsert (in postgres) + 494 [0x10136344e]
ExecCheckIndexConstraints (in postgres) + 570 [0x10133910a]
check_exclusion_or_unique_constraint (in postgres) + 977 [0x101338db1]
XactLockTableWait (in postgres) + 176 [0x101492770]
LockAcquireExtended (in postgres) + 1274 [0x101493aaa]
Thanks!
Best regards, Andrey Borodin.
Hi,
On 2020-01-12 16:18:38 +0500, Andrey Borodin wrote:
On 11 Jan 2020, at 07:34, Bruce Momjian <bruce@momjian.us> wrote:
Actually, it might be worse than that. In my reading of
RecordTransactionCommit(), we do this:

write to WAL
flush WAL (durable)
make visible to other backends
replicate
communicate to the client

I think this means we make the transaction commit visible to all
backends _before_ we replicate it, and potentially wait until we get a
replication reply to return SUCCESS to the client.

No. The data is not visible to other backends while we await sync rep.
Yea, as the relevant comment in RecordTransactionCommit() says:
* Note that at this stage we have marked clog, but still show as running
* in the procarray and continue to hold locks.
*/
if (wrote_xlog && markXidCommitted)
SyncRepWaitForLSN(XactLastRecEnd, true);
But it's worthwhile to emphasize that data at that stage actually *can*
be visible on standbys. The fact that the transaction still shows as
running via procarray, on the primary, does not influence visibility
determinations on the standby.
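For illustration, a sketch reusing the table x from the session above: while the primary's backend is still stuck in the sync rep wait, a standby that has received and applied that commit record can already return the row:

standby=# select * from x where key = 7;
 key | data
-----+------
   7 | 7
(1 row)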
Greetings,
Andres Freund
On 15.01.2020 01:53, Andres Freund wrote:
On 2020-01-12 16:18:38 +0500, Andrey Borodin wrote:
On 11 Jan 2020, at 07:34, Bruce Momjian <bruce@momjian.us> wrote:
Actually, it might be worse than that. In my reading of
RecordTransactionCommit(), we do this:

write to WAL
flush WAL (durable)
make visible to other backends
replicate
communicate to the client

I think this means we make the transaction commit visible to all
backends _before_ we replicate it, and potentially wait until we get a
replication reply to return SUCCESS to the client.

No. The data is not visible to other backends while we await sync rep.
Yea, as the relevant comment in RecordTransactionCommit() says:

* Note that at this stage we have marked clog, but still show as running
* in the procarray and continue to hold locks.
*/
if (wrote_xlog && markXidCommitted)
SyncRepWaitForLSN(XactLastRecEnd, true);

But it's worthwhile to emphasize that data at that stage actually *can*
be visible on standbys. The fact that the transaction still shows as
running via procarray, on the primary, does not influence visibility
determinations on the standby.
In the common case, consistent reads in a cluster (even if remote_apply is
on) are available (and have to be) only on the master node. For example, if
a random load balancer for read-only queries is set up over the master
and the sync replicas (even with remote_apply on), it's possible to
hit the case where preceding reads return data that is
absent on subsequent ones.
Moreover, such visible commits on a sync standby are not durable from the
cluster's point of view. For example, if we have two sync standbys, then
under failover we can switch the master to the sync standby to which the waiting
commit was not replicated, even though it was applied (and visible) on the other
standby. This switch is completely safe, because the client has not
received an acknowledgement of the commit request and the transaction is in an
indeterminate state; it can end up either committed or aborted depending on
which standby is promoted.
--
Best regards,
Maksim Milyutin
Hello.
I just want to share some thoughts about how this looks from the perspective
of a high-availability web-service application developer,
because sometimes things look different from the other side. And,
to be honest, it all looks like a disaster.
But let's take it one step at a time.
First, the problem is not related to upsert queries only. It can be
reproduced with plain INSERTs or UPDATEs. For example:
* client 1 inserts new records and waits for synchronous replication
* client 1 cancels the query
* clients 2, 3, 4 and 5 see the new data and perform some actions outside
of the database in external systems
* the master is switched to a replica with none of the new records' WAL replicated yet
As a result, the newly inserted data is just gone, but external systems
already rely on it.
And this is just a huge pain for the application and its developer.
Second, it is not about the client who canceled the query. That client may
be super clever and totally understand all of the tricky aspects and
risks of such an action.
But it is about *other* clients, who become able to see the
"non-existent" data. They don't even have a way to detect such a
situation.
Yes, currently there are a few ways for non-synchronously-replicated
data to become visible (for complex reasons, of course):
1) a client cancels the query while waiting for synchronous replication
2) a database restart
3) kill -9 of a backend waiting for synchronous replication
What is the main difference between 1 and 2/3? That 1 is performed
not only by humans.
Moreover, it is performed mostly by applications. And it happens
right now on thousands of servers!
Check [1] and [2]. This is the official JDBC driver for PostgreSQL, and I am
sure it is the most popular way to communicate with PostgreSQL these
days.
Its implementation of Statement::setQueryTimeout creates a timer that sends a
cancellation after the timeout. That is the official and recommended way to
limit statement execution to some interval in JDBC [3].
In my project (and its libraries) it is used in dozens of places. It
is also possible to search GitHub [4] to see how widely it is
used.
For example, it is used by the Spring framework [5], probably the most
popular framework in the world for the rank-2 programming language.
And the situation is even worse. When does setQueryTimeout
start canceling queries waiting on synchronous replication like crazy?
Yes, at the very moment the connection between the master and the sync
replica is lost (because all backends are now stuck in syncrep). A new master
will be elected in a few seconds (or maybe it has already been elected and is
working).
And... totally correct code cancels hundreds of queries stuck in
syncrep, making "non-existent" data readable by other
clients in the same availability zone.
It is just a nightmare, to be honest.
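The effect of such a timeout-driven cancel can also be sketched server-side, with pg_cancel_backend standing in for the driver's cancel (pid 12345 is hypothetical):

postgres=# select pid, wait_event from pg_stat_activity where wait_event = 'SyncRep';
  pid  | wait_event
-------+------------
 12345 | SyncRep
(1 row)

postgres=# select pg_cancel_backend(12345);
-- the canceled backend emits the WARNING and reports its write as successful,
-- exposing the locally committed row to every other client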
These days almost every web service needs HA for Postgres. And
practically speaking, if your code (or some library code) calls
Statement::setQueryTimeout, your HA (for example, Patroni) is
broken.
And it is really not easy to control setQueryTimeout calls in a modern
application with thousands of third-party libraries. Also, a lot of
applications are in the maintenance phase.
As for me, I am going to hack the Postgres JDBC driver to ignore
setQueryTimeout entirely for now.
I think the proper solution here would be to add a GUC that disallows cancellation of the synchronous replication wait.
This sounds entirely insane to me. There is no possibility that you
can prevent a failure from occurring at this step.
Yes, maybe it is insane, but it looks like the whole Java-Postgres-HA
world (and maybe others) is going down. So I believe we should even backport
such an insane knob.
As developers of distributed systems, we don't have many things to rely
on. They are:
1) if the database clearly says something is committed, it is committed
with ACID guarantees;
2) anything else may be committed, may not be committed, or may be
waiting to be committed.
And we have practically just lost the letter D from ACID.
Thanks,
Michail.
[1]: https://github.com/pgjdbc/pgjdbc/blob/23cce8ad35d9af6e2a1cb97fac69fdc0a7f94b42/pgjdbc/src/main/java/org/postgresql/core/QueryExecutorBase.java#L164-L200
[2]: https://github.com/pgjdbc/pgjdbc/blob/ed09fd1165f046ae956bf21b6c7882f1267fb8d7/pgjdbc/src/main/java/org/postgresql/jdbc/PgStatement.java#L538-L540
[3]: https://docs.oracle.com/javase/7/docs/api/java/sql/Statement.html#setQueryTimeout(int)
[4]: https://github.com/search?l=Java&q=statement.setQueryTimeout&type=Code
[5]: https://github.com/spring-projects/spring-framework/blob/master/spring-jdbc/src/main/java/org/springframework/jdbc/datasource/DataSourceUtils.java#L329-L343
On Sat, 2019-12-21 at 11:34 +0100, Marco Slot wrote:
The GUCs are not re-checked in the main loop in SyncRepWaitForLSN, so
backends will remain stuck there even if synchronous replication has
been (temporarily) disabled while they were waiting.
If you do:
alter system set synchronous_standby_names='';
select pg_reload_conf();
it will release the backends waiting on sync rep.
Regards,
Jeff Davis
On 9 June 2020, at 23:32, Jeff Davis <pgsql@j-davis.com> wrote:
After using the patch for a while, it became obvious that PANICing during termination is not a good idea, even when we are waiting for synchronous replication: it generates undesired core dumps.
I think that in the presence of SIGTERM it is reasonable to say that we cannot protect the user anymore.
PFA v3.
Best regards, Andrey Borodin.
Attachments:
v3-0001-Disallow-cancelation-of-syncronous-commit.patchapplication/octet-stream; name=v3-0001-Disallow-cancelation-of-syncronous-commit.patch; x-unix-mode=0644Download
From afbcde937f09a47115fd7241b632999274f89b8e Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Tue, 27 Oct 2020 14:21:21 +0500
Subject: [PATCH v3] Disallow cancelation of syncronous commit
Currently we allow cancelling the wait for synchronous commit.
Some drivers cancel queries after a timeout. If the application then
retries an idempotent query, it will get confirmation of the written
data. This can lead to split-brain in HA scenarios. To prevent this,
we add a synchronous_commit_cancelation setting disallowing cancellation
of the synchronous replication wait.
---
src/backend/access/transam/xact.c | 1 +
src/backend/replication/syncrep.c | 15 +++++++++++----
src/backend/utils/misc/guc.c | 9 +++++++++
src/include/access/xact.h | 3 +++
4 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 9cd0b7c11b..61d0c4555d 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -81,6 +81,7 @@ bool DefaultXactDeferrable = false;
bool XactDeferrable;
int synchronous_commit = SYNCHRONOUS_COMMIT_ON;
+bool synchronous_commit_cancelation = true;
/*
* CheckXidAlive is a xid value pointing to a possibly ongoing (sub)
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index 6e8c76537a..3c005a24ba 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -282,11 +282,18 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
if (QueryCancelPending)
{
QueryCancelPending = false;
+ if (synchronous_commit_cancelation)
+ {
+ ereport(WARNING,
+ (errmsg("canceling wait for synchronous replication due to user request"),
+ errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
+ SyncRepCancelWait();
+ break;
+ }
+
ereport(WARNING,
- (errmsg("canceling wait for synchronous replication due to user request"),
- errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
- SyncRepCancelWait();
- break;
+ (errmsg("canceling wait for synchronous replication due requested, but cancelation is not allowed"),
+ errdetail("The COMMIT record has already flushed to WAL locally and might not have been replicated to the standby. We must wait here.")));
}
/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index dabcbb0736..b174e6d471 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1289,6 +1289,15 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
+ {
+ {"synchronous_commit_cancelation", PGC_USERSET, WAL_SETTINGS,
+ gettext_noop("Allow to cancel waiting for replication of transaction commited localy."),
+ NULL
+ },
+ &synchronous_commit_cancelation,
+ true, NULL, NULL, NULL
+ },
+
{
{"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT,
gettext_noop("Logs each checkpoint."),
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 7320de345c..9d9a6877d5 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -86,6 +86,9 @@ extern int synchronous_commit;
extern PGDLLIMPORT TransactionId CheckXidAlive;
extern PGDLLIMPORT bool bsysscan;
+/* Allow cancellation of queries waiting for sync replication but committed locally */
+extern bool synchronous_commit_cancelation;
+
/*
* Miscellaneous flag bits to record events which occur on the top level
* transaction. These flags are only persisted in MyXactFlags and are intended
--
2.24.3 (Apple Git-128)
On 12/9/20 4:07 AM, Andrey Borodin wrote:
On 9 June 2020, at 23:32, Jeff Davis <pgsql@j-davis.com> wrote:
After using the patch for a while, it became obvious that PANICing during termination is not a good idea, even when we wait for synchronous replication: it generates undesired core dumps.
I think that in the presence of SIGTERM it's reasonable to say that we cannot protect the user anymore.
PFA v3.
Maksim, Michail, thoughts on this new patch?
Regards,
--
-David
david@pgmasters.net
On 2020/12/09 18:07, Andrey Borodin wrote:
On 9 June 2020, at 23:32, Jeff Davis <pgsql@j-davis.com> wrote:
After using the patch for a while, it became obvious that PANICing during termination is not a good idea, even when we wait for synchronous replication: it generates undesired core dumps.
I think that in the presence of SIGTERM it's reasonable to say that we cannot protect the user anymore.
PFA v3.
I don't think that preventing a backend from being canceled while waiting for
sync rep actually addresses your issue. As mentioned upthread, there are
other cases that can cause the issue, for example, a restart of the server
while backends are waiting for sync rep.
As far as I understand your idea, what we should do is to make a new
transaction wait, before it starts, until WAL has been replicated to the
standby up to the latest WAL record committed locally? Then we would not
need to prevent the cancellation during the sync rep wait.
If we do that, a new transaction cannot see any changes made by another
transaction that was canceled during the sync rep wait, until all the
committed WAL records have been replicated. Doesn't this address your
issue? I think that this idea works not only in the cancellation case but
also in the other cases.
If we want to control this new wait at the application level, we can
implement something like a pg_wait_for_syncrep(pg_lsn) function. This
function would wait until WAL has been replicated to the standby up to the
specified LSN. For example, we could execute
pg_wait_for_syncrep(pg_current_wal_lsn()) in the application whenever we
need that consistent point.
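For illustration, a sketch of the application side under this proposal (pg_wait_for_syncrep() does not exist today; the call assumes the proposed signature):

    import java.sql.Connection;
    import java.sql.SQLException;
    import java.sql.Statement;

    public class ConsistentPoint {
        // Blocks until everything committed so far on this server has been
        // replicated to the synchronous standby, per the proposed function.
        public static void waitForSyncRep(Connection conn) throws SQLException {
            try (Statement stmt = conn.createStatement()) {
                stmt.execute("SELECT pg_wait_for_syncrep(pg_current_wal_lsn())");
            }
        }
    }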
Another idea is to add a new GUC. If this GUC is enabled, a transaction
waits for all the committed records to be replicated whenever it takes a
new snapshot (probably the transaction needs to wait not only when starting
but also when taking a new snapshot). This prevents the transaction from
seeing any data that has not been replicated yet.
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
Thanks for looking into this!
On 11 March 2021, at 19:15, Fujii Masao <masao.fujii@oss.nttdata.com> wrote:
On 2020/12/09 18:07, Andrey Borodin wrote:
On 9 June 2020, at 23:32, Jeff Davis <pgsql@j-davis.com> wrote:
After using the patch for a while, it became obvious that PANICing during termination is not a good idea, even when we wait for synchronous replication: it generates undesired core dumps.
I think that in the presence of SIGTERM it's reasonable to say that we cannot protect the user anymore.
PFA v3.
I don't think that preventing a backend from being canceled while waiting for
sync rep actually addresses your issue. As mentioned upthread, there are
other cases that can cause the issue, for example, a restart of the server
while backends are waiting for sync rep.
Well, the patch fully addresses _my_ issue :) My issue is the breaking of synchronous replication guarantees by sending a "cancel" message, which is sent by most drivers automatically.
The patch does not need to address the issue of server restart - it's the job of the HA tool to prevent the start of a database service in the case when a new primary has been elected.
The only case the patch does not handle is a sudden backend crash - Postgres will recover without a restart. I think it is a very small problem compared to "cancel": one needs not only a failover but also a SIGSEGV in a backend to encounter this problem. Anyway, we could address this issue by adding one more GUC preventing PostmasterStateMachine() from invoking crash recovery when (FatalError && pmState == PM_NO_CHILDREN).
As far as I understand your idea, what we should do is to make a new
transaction wait, before it starts, until WAL has been replicated to the
standby up to the latest WAL record committed locally? Then we would not
need to prevent the cancellation during the sync rep wait.
If we do that, a new transaction cannot see any changes made by another
transaction that was canceled during the sync rep wait, until all the
committed WAL records have been replicated. Doesn't this address your issue?
Preventing any new transaction from starting during the sync replication wait is not really an option. It would double the latency cost of synchronous replication for writing transactions (one RTT wait on start, one RTT wait on commit) and would impose the same cost on reading transactions, which did not need it before (e.g., with a 1 ms RTT to the standby, every read-only transaction would newly pay about 1 ms on start).
I think that this idea works not only in the cancellation case but also in
the other cases.
If we want to control this new wait at the application level, we can
implement something like a pg_wait_for_syncrep(pg_lsn) function. This
function would wait until WAL has been replicated to the standby up to the
specified LSN. For example, we could execute
pg_wait_for_syncrep(pg_current_wal_lsn()) in the application whenever we
need that consistent point.
We want this for every transaction running with synchronous_commit > local. We should not ask users to run one more "make the transaction durable" statement; COMMIT is that statement.
Another idea is to add a new GUC. If this GUC is enabled, a transaction
waits for all the committed records to be replicated whenever it takes a
new snapshot (probably the transaction needs to wait not only when starting
but also when taking a new snapshot). This prevents the transaction from
seeing any data that has not been replicated yet.
If we block new snapshots after a local commit until successful replication, we in fact linearize reads from standbys. The cost would be immense. The whole idea of MVCC is that writers do not block readers.
Thanks for the ideas!
Best regards, Andrey Borodin.
Hi hackers,
After using the patch for a while, it became obvious that PANICing during termination is not a good idea, even when we wait for synchronous replication: it generates undesired core dumps.
I think that in the presence of SIGTERM it's reasonable to say that we cannot protect the user anymore.
PFA v3.
This patch, although solving a concrete and important problem, looks
more like a quick workaround than an appropriate solution. Or is it
just me?
Ideally, the transaction should be committed only after getting a
reply from the standby. If the user cancels the transaction, it
doesn't get committed anywhere. This is what people into distributed
systems would expect, at least unless stated otherwise. Although I
realize how complicated it is to implement, especially considering all
the possible corner cases (a netsplit right after getting a reply, etc.).
Maybe we could come up with a less than ideal, but still sound and
easy-to-understand model, which, as soon as you learned it, doesn't
bring unexpected surprises to the user.
I believe at this point it's important to agree on whether the community is
ready to accept the patch as is, to make existing users suffer less, and
iterate afterward, or whether we choose not to do that and come up with
another idea. Personally, I don't have any better ideas, so maybe
accepting Andrey's patch would be the lesser of two evils.
--
Best regards,
Aleksander Alekseev
Hi Aleksander!
Thanks for looking into this.
On 23 April 2021, at 14:30, Aleksander Alekseev <aleksander@timescale.com> wrote:
Hi hackers,
After using the patch for a while, it became obvious that PANICing during termination is not a good idea, even when we wait for synchronous replication: it generates undesired core dumps.
I think that in the presence of SIGTERM it's reasonable to say that we cannot protect the user anymore.
PFA v3.
This patch, although solving a concrete and important problem, looks
more like a quick workaround than an appropriate solution. Or is it just me?
Ideally, the transaction should be committed only after getting a
reply from the standby.
Getting a reply from the standby is part of the commit. The commit is completed only when the WAL has reached the standby. The commit, certainly, was initiated before getting the reply from the standby. We cannot commit only after we commit.
If the user cancels the transaction, it
doesn't get committed anywhere.
The problem is that the user tries to cancel a transaction after they have asked for a commit. We never promised to roll back a committed transaction.
When the user asks for a commit, we insert a commit record into WAL and then wait until it is acknowledged by a quorum of standbys and by local storage.
We cannot discard this record on standbys. Otherwise, at some point, we would have to discard discard records. Or discard discard discard records.
This is what people into distributed
systems would expect unless stated otherwise, at least.
I think our transaction semantics are stated clearly in the documentation.
Although I
realize how complicated it is to implement, especially considering all
the possible corner cases (a netsplit right after getting a reply, etc.).
Maybe we could come up with a less than ideal, but still sound and
easy-to-understand model, which, as soon as you learned it, doesn't
bring unexpected surprises to the user.
The model proposed by my patch is as follows:
transaction effects should not be observable on the primary until the requirements of synchronous_commit are satisfied.
E.g. even if the user cancels a locally committed transaction, we should not release the locks held by that transaction.
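A sketch of the client-visible behavior under this model (the GUC name is taken from the attached patch; table and timeout are illustrative). The driver's cancel is acknowledged only with a WARNING, the backend keeps waiting and holds its locks, so a retry on another connection blocks instead of reading not-yet-replicated data:

    import java.sql.Connection;
    import java.sql.SQLException;
    import java.sql.Statement;

    public class NoCancelCommit {
        public static void upsert(Connection conn) throws SQLException {
            try (Statement stmt = conn.createStatement()) {
                stmt.execute("SET synchronous_commit_cancelation = off");
                stmt.setQueryTimeout(5);
                // Even when the timeout fires and the driver sends a cancel,
                // this returns only after the synchronous standby has
                // acknowledged the commit.
                stmt.executeUpdate(
                    "INSERT INTO t(id) VALUES (1) ON CONFLICT DO NOTHING");
            }
        }
    }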
What unexpected surprises do you see in this model?
Thanks for reviewing!
Best regards, Andrey Borodin.
I came across this thread [1] to disallow canceling a transaction not
yet confirmed by a synchronous replica. I think my proposed patch
might help that case as well, hence adding all involved in that thread
to BCC, for one-time notification.
As mentioned in that thread, when sending a cancellation signal, the
client cannot be sure whether the cancel signal was honored and whether
the transaction was cancelled successfully. In the attached patch, the
backend emits a NotificationResponse containing the current full
transaction ID. It does so only if the relevant GUC is enabled, and
only when the top-level transaction is being assigned its ID.
This information can be useful to the client, when:
i) it wants to cancel a transaction _after_ issuing a COMMIT, and
ii) it wants to check the status of its transaction that it sent
COMMIT for, but never received a response (perhaps because the server
crashed).
Additionally, this information can be useful for middleware, like
Transaction Processing Monitors, which can now transparently (without
any change in application code) monitor the status of transactions (by
watching for the transaction status indicator in the ReadyForQuery
protocol message). They can use the transaction ID from the
NotificationResponse to open a watcher, and on seeing either an 'E' or
'I' payload in subsequent ReadyForQuery messages, close the watcher.
On server crash, or other adverse events, they can then use the
transaction IDs still being watched to check status of those
transactions, and take appropriate actions, e.g. retry any aborted
transactions.
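To sketch what the middleware side could look like with pgjdbc (the channel name is taken from the attached patch; the table and the watcher bookkeeping are hypothetical):

    import java.sql.Connection;
    import java.sql.SQLException;
    import java.sql.Statement;
    import org.postgresql.PGConnection;
    import org.postgresql.PGNotification;

    public class TxWatcher {
        public static void watchOneTransaction(Connection conn) throws SQLException {
            conn.setAutoCommit(false);
            try (Statement stmt = conn.createStatement()) {
                stmt.execute("SET listen_transaction_id = on");
                // The first write assigns the top-level xid and triggers the
                // NotificationResponse proposed by the patch.
                stmt.executeUpdate("INSERT INTO audit(note) VALUES ('x')");
            }
            PGNotification[] notes =
                conn.unwrap(PGConnection.class).getNotifications();
            if (notes != null) {
                for (PGNotification n : notes) {
                    if ("_my_transaction_id".equals(n.getName())) {
                        String fxid = n.getParameter(); // full xid as text
                        // Open a watcher keyed by fxid; close it on the next
                        // ReadyForQuery with status 'I' or 'E'. After a crash,
                        // resolve it via SELECT pg_xact_status(fxid::xid8).
                        System.out.println("watching transaction " + fxid);
                    }
                }
            }
            conn.commit();
        }
    }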
We cannot use the elog() mechanism for this notification because it is
sensitive to the value of client_min_messages. Hence I used the NOTIFY
infrastructure for this message. I understand that this usage violates
some expectations as to how NOTIFY messages are supposed to behave
(see [2] below), but I think these are acceptable violations; I am open
to hearing if/why this might not be acceptable, and any possible
alternatives.
I'm not very familiar with the parallel workers infrastructure, so the
patch is missing any consideration for those.
Reviews welcome.
[1]: subject was: Re: Disallow cancellation of waiting for synchronous replication
thread: /messages/by-id/C1F7905E-5DB2-497D-ABCC-E14D4DEE506C@yandex-team.ru
[2]: At present, NotificationResponse can only be sent outside a
transaction, and thus it will not occur in the middle of a
command-response series, though it might occur just before ReadyForQuery.
It is unwise to design frontend logic that assumes that, however.
Good practice is to be able to accept NotificationResponse at any
point in the protocol.
Best regards,
--
Gurjeet Singh http://gurjeet.singh.im/
Attachments:
notify_xid.patchapplication/octet-stream; name=notify_xid.patchDownload
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index bc2a2feb0b..fad49aba82 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -1315,6 +1315,17 @@ SELCT 1/0;<!-- this typo is intentional -->
channel name.
</para>
+ <para>
+ When the parameter <varname>listen_transaction_id</varname> is enabled, at
+ the beginning of a transaction (not a subtransaction) the backend sends a
+ NotificationResponse message containing the transaction ID on the <literal>
+ _my_transaction_id</literal> channel. This may be useful for the frontend to
+ use after crash recovery, to inspect the state of their transaction from
+ before the crash; see <function>pg_xact_status()</function>. Unlike other
+ NotificationResponse messages, the process ID in this message is the same as
+ that of the current backend.
+ </para>
+
<note>
<para>
At present, NotificationResponse can only be sent outside a
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 441445927e..e80dfb3a7d 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -68,6 +68,8 @@
#include "utils/timeout.h"
#include "utils/timestamp.h"
+extern char *FullTransactionIdToStr(FullTransactionId fxid);
+
/*
* User-tweakable parameters
*/
@@ -107,6 +109,7 @@ bool bsysscan = false;
* XactTopFullTransactionId stores the XID of our toplevel transaction, which
* will be the same as TopTransactionStateData.fullTransactionId in an
* ordinary backend; but in a parallel backend, which does not have the entire
* transaction state, it will instead be copied from the backend that started
* the parallel operation.
*
@@ -714,6 +717,21 @@ AssignTransactionId(TransactionState s)
TopTransactionStateData.didLogXid = true;
}
}
+
+ /* NOTIFY the frontend, if it wants to know the top transaction's ID. */
+ if (!isSubXact && listen_transaction_id)
+ {
+ char *xidStr;
+
+ Assert(s->parent == NULL);
+
+ /* Should we Assert(!IsParallelWorker()) here? */
+
+ xidStr = FullTransactionIdToStr(s->fullTransactionId);
+
+ NotifyMyFrontEnd("_my_transaction_id", xidStr, MyProcPid);
+ pfree(xidStr);
+ }
}
/*
@@ -6005,7 +6023,7 @@ xact_redo(XLogReaderState *record)
}
else if (info == XLOG_XACT_COMMIT_PREPARED)
{
- xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
xl_xact_parsed_commit parsed;
ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed);
diff --git a/src/backend/utils/adt/xid.c b/src/backend/utils/adt/xid.c
index 24c1c93732..755cc53d36 100644
--- a/src/backend/utils/adt/xid.c
+++ b/src/backend/utils/adt/xid.c
@@ -161,13 +161,19 @@ xid8in(PG_FUNCTION_ARGS)
PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromU64(pg_strtouint64(str, NULL, 0)));
}
+char *
+FullTransactionIdToStr(FullTransactionId fxid)
+{
+ char *result = (char *) palloc(21);
+ snprintf(result, 21, UINT64_FORMAT, U64FromFullTransactionId(fxid));
+ return result;
+}
+
Datum
xid8out(PG_FUNCTION_ARGS)
{
FullTransactionId fxid = PG_GETARG_FULLTRANSACTIONID(0);
- char *result = (char *) palloc(21);
-
- snprintf(result, 21, UINT64_FORMAT, U64FromFullTransactionId(fxid));
+ char *result = FullTransactionIdToStr(fxid);
PG_RETURN_CSTRING(result);
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 68b62d523d..1df14b9745 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -947,11 +947,21 @@ static const unit_conversion time_unit_conversion_table[] =
* variable_is_guc_list_quote() in src/bin/pg_dump/dumputils.c.
*/
+bool listen_transaction_id = false;
/******** option records follow ********/
static struct config_bool ConfigureNamesBool[] =
{
+ {
+ {"listen_transaction_id", PGC_USERSET, UNGROUPED,
+ gettext_noop("Emit top-level transaction ID on transaction start."),
+ NULL
+ },
+ &listen_transaction_id,
+ false,
+ NULL, NULL, NULL
+ },
{
{"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of sequential-scan plans."),
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index a7c3a4958e..ed86a7a16d 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -278,6 +278,8 @@ extern int tcp_keepalives_interval;
extern int tcp_keepalives_count;
extern int tcp_user_timeout;
+extern bool listen_transaction_id;
+
#ifdef TRACE_SORT
extern bool trace_sort;
#endif