VM map freeze corruption
pg_check_frozen() reports corrupted VM freeze state.
Found with one of my stress tests. Simplified to the repro below.
The reason for the 33 rows/pages is that I wanted to test if a 2nd vacuum freeze repaired the situation. I was confounded till I discovered SKIP_PAGES_THRESHOLD was 32.
My analysis is that heap_prepare_freeze_tuple->FreezeMultiXactId() returns FRM_NOOP if the MultiXACT locked rows haven't committed. This results in changed=false and totally_frozen=true(as initialized). When this returns to lazy_scan_heap(), no rows are added to the frozen[] array. Yet, tuple_totally_frozen is true. This means the page is marked frozen in the VM, even though the MultiXACT row wasn't left untouch.
A fix to heap_prepare_freeze_tuple() that seems to do the trick is:
else
{
Assert(flags & FRM_NOOP);
+ totally_frozen = false;
}
BASH script repro below:
#!/bin/bash
p="psql -h 127.0.0.1 -p 5432 postgres"
echo "create extension pg_visibility;" | $p
$p <<XXX
drop table t;
create table t (i int primary key, c char(7777));
alter table t alter column c set storage plain;
insert into t select generate_series(0, 32, 1), 'XXX';
XXX
# Start two share lockers in the background
$p <<XXX >/dev/null &
begin;
select i, length(c) from t for share;
select pg_sleep(2);
commit;
XXX
$p <<XXX >/dev/null &
begin;
select i, length(c) from t for share;
select pg_sleep(2);
commit;
XXX
# Freeze while multixact locks are held
echo "vacuum freeze t;" | $p
echo "select pg_check_frozen('t');" | $p
sleep 4; # Wait for share locks to be released
# See if another freeze corrects the problem
echo "vacuum freeze t;" | $p
echo "select pg_check_frozen('t');" | $p
On Wed, Apr 18, 2018 at 7:37 AM, Wood, Dan <hexpert@amazon.com> wrote:
My analysis is that heap_prepare_freeze_tuple->FreezeMultiXactId()
returns FRM_NOOP if the MultiXACT locked rows haven't committed. This
results in changed=false and totally_frozen=true(as initialized). When
this returns to lazy_scan_heap(), no rows are added to the frozen[] array.
Yet, tuple_totally_frozen is true. This means the page is marked frozen in
the VM, even though the MultiXACT row wasn't left untouch.A fix to heap_prepare_freeze_tuple() that seems to do the trick is:
else
{
Assert(flags & FRM_NOOP);
+ totally_frozen = false;
}
That's a great find! This can definitely lead to various problems and could
be one of the reasons behind the issue reported here [1]/messages/by-id/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com. For example, if
we change the script slightly at the end, we can get the same error
reported in the bug report.
sleep 4; # Wait for share locks to be released
# See if another vacuum freeze advances relminmxid beyond xmax present in
the
# heap
echo "vacuum (verbose, freeze) t;" | $p
echo "select pg_check_frozen('t');" | $p
# See if a vacuum freeze scanning all pages corrects the problem
echo "vacuum (verbose, freeze, disable_page_skipping) t;" | $p
echo "select pg_check_frozen('t');" | $p
Thanks,
Pavan
[1]: /messages/by-id/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com
/messages/by-id/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com
--
Pavan Deolasee http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Pavan Deolasee wrote:
On Wed, Apr 18, 2018 at 7:37 AM, Wood, Dan <hexpert@amazon.com> wrote:
My analysis is that heap_prepare_freeze_tuple->FreezeMultiXactId()
returns FRM_NOOP if the MultiXACT locked rows haven't committed. This
results in changed=false and totally_frozen=true(as initialized). When
this returns to lazy_scan_heap(), no rows are added to the frozen[] array.
Yet, tuple_totally_frozen is true. This means the page is marked frozen in
the VM, even though the MultiXACT row wasn't left untouch.A fix to heap_prepare_freeze_tuple() that seems to do the trick is:
else
{
Assert(flags & FRM_NOOP);
+ totally_frozen = false;
}That's a great find!
Indeed.
This family of bugs (introduced by freeze map changes in 9.6) was
initially fixed in 38e9f90a227d but this spot was missed in that fix.
IMO the cause is the totally_frozen variable, which starts life in a
bogus state (true) and the different code paths can set it to the right
state, or by inaction end up deciding that the initial bogus state was
correct in the first place. Errors of omission are far too easy in that
kind of model, ISTM, so I propose this slightly different patch, which
albeit yet untested seems easier to verify and easier to get right.
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
frozen.patchtext/plain; charset=us-asciiDownload
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 4fdb549099..791958a543 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6800,9 +6800,10 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
{
bool changed = false;
- bool freeze_xmax = false;
+ bool xmin_frozen = false; /* is xmin frozen after this? */
+ bool freeze_xmax = false; /* whether to freeze xmax now */
+ bool xmax_frozen = false; /* ix xmax frozen after this? */
TransactionId xid;
- bool totally_frozen = true;
frz->frzflags = 0;
frz->t_infomask2 = tuple->t_infomask2;
@@ -6829,10 +6830,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
frz->t_infomask |= HEAP_XMIN_FROZEN;
changed = true;
+ xmin_frozen = true;
}
- else
- totally_frozen = false;
}
+ else if (HeapTupleHeaderXminFrozen(tuple))
+ xmin_frozen = true;
/*
* Process xmax. To thoroughly examine the current Xmax value we need to
@@ -6870,7 +6872,6 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
if (flags & FRM_MARK_COMMITTED)
frz->t_infomask |= HEAP_XMAX_COMMITTED;
changed = true;
- totally_frozen = false;
}
else if (flags & FRM_RETURN_IS_MULTI)
{
@@ -6892,7 +6893,6 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
frz->xmax = newxmax;
changed = true;
- totally_frozen = false;
}
else
{
@@ -6923,9 +6923,10 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
xid)));
freeze_xmax = true;
}
- else
- totally_frozen = false;
}
+ else if (((tuple->t_infomask & HEAP_XMAX_INVALID) == HEAP_XMAX_INVALID) ||
+ !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
+ xmax_frozen = true;
if (freeze_xmax)
{
@@ -6941,6 +6942,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
changed = true;
+ xmax_frozen = true;
}
/*
@@ -6981,7 +6983,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
}
}
- *totally_frozen_p = totally_frozen;
+ *totally_frozen_p = xmin_frozen && xmax_frozen;
return changed;
}
On Wed, Apr 18, 2018 at 10:36 PM, Alvaro Herrera
<alvherre@alvh.no-ip.org> wrote:
Pavan Deolasee wrote:
On Wed, Apr 18, 2018 at 7:37 AM, Wood, Dan <hexpert@amazon.com> wrote:
My analysis is that heap_prepare_freeze_tuple->FreezeMultiXactId()
returns FRM_NOOP if the MultiXACT locked rows haven't committed. This
results in changed=false and totally_frozen=true(as initialized). When
this returns to lazy_scan_heap(), no rows are added to the frozen[] array.
Yet, tuple_totally_frozen is true. This means the page is marked frozen in
the VM, even though the MultiXACT row wasn't left untouch.A fix to heap_prepare_freeze_tuple() that seems to do the trick is:
else
{
Assert(flags & FRM_NOOP);
+ totally_frozen = false;
}That's a great find!
Indeed.
This family of bugs (introduced by freeze map changes in 9.6) was
initially fixed in 38e9f90a227d but this spot was missed in that fix.IMO the cause is the totally_frozen variable, which starts life in a
bogus state (true) and the different code paths can set it to the right
state, or by inaction end up deciding that the initial bogus state was
correct in the first place. Errors of omission are far too easy in that
kind of model, ISTM, so I propose this slightly different patch, which
albeit yet untested seems easier to verify and easier to get right.
Thank you for the patch!
Agreed. The patch looks to fix this issue correctly.
Regards,
--
Masahiko Sawada
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center
On Wed, Apr 18, 2018 at 7:06 PM, Alvaro Herrera <alvherre@alvh.no-ip.org>
wrote:
IMO the cause is the totally_frozen variable, which starts life in a
bogus state (true) and the different code paths can set it to the right
state, or by inaction end up deciding that the initial bogus state was
correct in the first place. Errors of omission are far too easy in that
kind of model, ISTM, so I propose this slightly different patch, which
albeit yet untested seems easier to verify and easier to get right.
I wonder if we should just avoid initialising those variables at the top
and take compiler's help to detect any missed branches. That way, we shall
know exactly why and where we are making them true/false. I also think that
we should differentiate between scenarios where xmin/xmax is already frozen
and scenarios where we are freezing them now.
I come up with attached. Not fully polished (and there is a XXX that I left
for comments), but hopefully enough to tell what I am thinking. Do you
think it's any improvement or actually makes the problem worse?
Thanks,
Pavan
--
Pavan Deolasee http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
Attachments:
frozen_v2.patchapplication/octet-stream; name=frozen_v2.patchDownload
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 4fdb549099..0040913730 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6800,9 +6800,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
{
bool changed = false;
- bool freeze_xmax = false;
+ bool xmin_already_frozen;
+ bool xmax_already_frozen;
+ bool freeze_xmin;
+ bool freeze_xmax;
TransactionId xid;
- bool totally_frozen = true;
frz->frzflags = 0;
frz->t_infomask2 = tuple->t_infomask2;
@@ -6811,7 +6813,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
/* Process xmin */
xid = HeapTupleHeaderGetXmin(tuple);
- if (TransactionIdIsNormal(xid))
+
+ freeze_xmin = false;
+ if (HeapTupleHeaderXminFrozen(tuple))
+ xmin_already_frozen = true;
+ else if (TransactionIdIsNormal(xid))
{
if (TransactionIdPrecedes(xid, relfrozenxid))
ereport(ERROR,
@@ -6827,11 +6833,15 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
xid, cutoff_xid)));
- frz->t_infomask |= HEAP_XMIN_FROZEN;
- changed = true;
+ freeze_xmin = true;
}
- else
- totally_frozen = false;
+ }
+
+ if (freeze_xmin)
+ {
+ Assert(!xmin_already_frozen);
+ frz->t_infomask |= HEAP_XMIN_FROZEN;
+ changed = true;
}
/*
@@ -6854,9 +6864,9 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
relfrozenxid, relminmxid,
cutoff_xid, cutoff_multi, &flags);
- if (flags & FRM_INVALIDATE_XMAX)
- freeze_xmax = true;
- else if (flags & FRM_RETURN_IS_XID)
+ freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
+
+ if (flags & FRM_RETURN_IS_XID)
{
/*
* NB -- some of these transformations are only valid because we
@@ -6870,7 +6880,6 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
if (flags & FRM_MARK_COMMITTED)
frz->t_infomask |= HEAP_XMAX_COMMITTED;
changed = true;
- totally_frozen = false;
}
else if (flags & FRM_RETURN_IS_MULTI)
{
@@ -6892,15 +6901,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
frz->xmax = newxmax;
changed = true;
- totally_frozen = false;
- }
- else
- {
- Assert(flags & FRM_NOOP);
}
}
else if (TransactionIdIsNormal(xid))
{
+ freeze_xmax = false;
if (TransactionIdPrecedes(xid, relfrozenxid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
@@ -6923,12 +6928,23 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
xid)));
freeze_xmax = true;
}
- else
- totally_frozen = false;
+ }
+ else if (((tuple->t_infomask & HEAP_XMAX_INVALID) == HEAP_XMAX_INVALID) ||
+ !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
+ {
+ freeze_xmax = false;
+ xmax_already_frozen = true;
+ }
+ else
+ {
+ /* XXX ERROR? */
+ freeze_xmax = false;
}
if (freeze_xmax)
{
+ Assert(!xmax_already_frozen);
+
frz->xmax = InvalidTransactionId;
/*
@@ -6981,7 +6997,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
}
}
- *totally_frozen_p = totally_frozen;
+ *totally_frozen_p = (freeze_xmin || xmin_already_frozen) &&
+ (freeze_xmax || xmax_already_frozen);
return changed;
}
Pavan Deolasee wrote:
I wonder if we should just avoid initialising those variables at the top
and take compiler's help to detect any missed branches. That way, we shall
know exactly why and where we are making them true/false. I also think that
we should differentiate between scenarios where xmin/xmax is already frozen
and scenarios where we are freezing them now.
After staring at this code for a while, it strikes me that the xmin case
is different enough from the xmax case that it works better to let the
logic be different; namely for xmin a single bool suffices. I kept your
logic for xmax, except that xmax_already_frozen requires initialization
(to 'false') or we risk having it end up as 'true' due to random garbage
in the stack and set totally_frozen_p to true spuriously because of this
(or spuriously fail the assertion in line 6942). I noticed this when
running Dan's test case with your patch -- the assertion failed for the
xmin case:
TRAP: FailedAssertion(�!(!xmin_already_frozen)�, Archivo: �/pgsql/source/master/src/backend/access/heap/heapam.c�, L�nea: 6845)
That led me to the rewrite of the xmin logic, and it also led me to
looking deeper at the xmax case (with which I didn't run into any
assertion failure, but it's clear that it could definitely happen if the
stack happens to have that bit set.)
I also chose to accept the suggestion in XXX to throw an error:
if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
...
else if (TransactionIdIsNormal(xid))
...
else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
xid, tuple->t_infomask)));
There's no place else in the backend where we report an infomask, but in
this case it seems warranted (for forensics) if this elog ever fires.
The tests involved are:
which means that this ereport could fire is if the transaction is
BootstrapTransactionId or FrozenTransactionId. In either case VACUUM
should have removed the tuple as dead, so these cases shouldn't ever
occur.
(Looking at the other caller for this code, which is cluster.c for table
rewrites, it appears that dead tuples are not passed to
heap_freeze_tuple, so xmax=BootstrapXid/FrozenXid should not be a
problem there either.)
Patch attached. I intend to push this soon, to have it in next week's
set.
--
�lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachments:
frozen_v3.patchtext/plain; charset=us-asciiDownload
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 1a672150be..72395a50b8 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6803,9 +6803,10 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
{
bool changed = false;
- bool freeze_xmax = false;
+ bool xmax_already_frozen = false;
+ bool xmin_frozen;
+ bool freeze_xmax;
TransactionId xid;
- bool totally_frozen = true;
frz->frzflags = 0;
frz->t_infomask2 = tuple->t_infomask2;
@@ -6814,6 +6815,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
/* Process xmin */
xid = HeapTupleHeaderGetXmin(tuple);
+ xmin_frozen = ((xid == FrozenTransactionId) ||
+ HeapTupleHeaderXminFrozen(tuple));
if (TransactionIdIsNormal(xid))
{
if (TransactionIdPrecedes(xid, relfrozenxid))
@@ -6832,9 +6835,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
frz->t_infomask |= HEAP_XMIN_FROZEN;
changed = true;
+ xmin_frozen = true;
}
- else
- totally_frozen = false;
}
/*
@@ -6857,9 +6859,9 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
relfrozenxid, relminmxid,
cutoff_xid, cutoff_multi, &flags);
- if (flags & FRM_INVALIDATE_XMAX)
- freeze_xmax = true;
- else if (flags & FRM_RETURN_IS_XID)
+ freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
+
+ if (flags & FRM_RETURN_IS_XID)
{
/*
* NB -- some of these transformations are only valid because we
@@ -6873,7 +6875,6 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
if (flags & FRM_MARK_COMMITTED)
frz->t_infomask |= HEAP_XMAX_COMMITTED;
changed = true;
- totally_frozen = false;
}
else if (flags & FRM_RETURN_IS_MULTI)
{
@@ -6895,11 +6896,6 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
frz->xmax = newxmax;
changed = true;
- totally_frozen = false;
- }
- else
- {
- Assert(flags & FRM_NOOP);
}
}
else if (TransactionIdIsNormal(xid))
@@ -6927,11 +6923,24 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
freeze_xmax = true;
}
else
- totally_frozen = false;
+ freeze_xmax = false;
}
+ else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
+ !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
+ {
+ freeze_xmax = false;
+ xmax_already_frozen = true;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
+ xid, tuple->t_infomask)));
if (freeze_xmax)
{
+ Assert(!xmax_already_frozen);
+
frz->xmax = InvalidTransactionId;
/*
@@ -6984,7 +6993,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
}
}
- *totally_frozen_p = totally_frozen;
+ *totally_frozen_p = (xmin_frozen &&
+ (freeze_xmax || xmax_already_frozen));
return changed;
}