Reduce heap tuple header size
This patch, which is built upon the "HeapTupleHeader accessor macros"
patch from 2002-06-10, is supposed to reduce the heap tuple header size
by four bytes on most architectures. Of course it changes the on-disk
tuple format and therefore requires initdb. As I have (once more)
opened my mouth too wide, I'll have to provide a heap file conversion
utility, if this patch gets accepted... More on this later.
======================
All 81 tests passed.
======================
It's late now, I'll do more tests tomorrow.
Good night
Manfred
diff -ru ../orig/src/backend/access/heap/heapam.c src/backend/access/heap/heapam.c
--- ../orig/src/backend/access/heap/heapam.c 2002-06-13 19:34:48.000000000 +0200
+++ src/backend/access/heap/heapam.c 2002-06-13 22:31:42.000000000 +0200
@@ -2204,7 +2204,7 @@
htup->t_infomask = HEAP_XMAX_INVALID | xlhdr.mask;
HeapTupleHeaderSetXmin(htup, record->xl_xid);
HeapTupleHeaderSetCmin(htup, FirstCommandId);
- HeapTupleHeaderSetXmax(htup, InvalidTransactionId);
+ HeapTupleHeaderSetXmaxInvalid(htup);
HeapTupleHeaderSetCmax(htup, FirstCommandId);
offnum = PageAddItem(page, (Item) htup, newlen, offnum,
diff -ru ../orig/src/include/access/htup.h src/include/access/htup.h
--- ../orig/src/include/access/htup.h 2002-06-13 19:34:49.000000000 +0200
+++ src/include/access/htup.h 2002-06-14 01:12:47.000000000 +0200
@@ -57,15 +57,24 @@
* Also note that we omit the nulls bitmap if t_infomask shows that there
* are no nulls in the tuple.
*/
+/*
+** We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac
+** in three physical fields t_xmin, t_cid, t_xmax:
+** CommandId Cmin; insert CID stamp
+** CommandId Cmax; delete CommandId stamp
+** TransactionId Xmin; insert XID stamp
+** TransactionId Xmax; delete XID stamp
+** TransactionId Xvac; used by VACUUM
+**
+** This assumes that a CommandId can be stored in a TransactionId.
+*/
typedef struct HeapTupleHeaderData
{
Oid t_oid; /* OID of this tuple -- 4 bytes */
- CommandId t_cmin; /* insert CID stamp -- 4 bytes each */
- CommandId t_cmax; /* delete CommandId stamp */
-
- TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */
- TransactionId t_xmax; /* delete XID stamp */
+ TransactionId t_xmin; /* Xmin -- 4 bytes each */
+ TransactionId t_cid; /* Cmin, Cmax, Xvac */
+ TransactionId t_xmax; /* Xmax, Cmax */
ItemPointerData t_ctid; /* current TID of this or newer tuple */
@@ -75,7 +84,7 @@
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
- /* ^ - 31 bytes - ^ */
+ /* ^ - 27 bytes - ^ */
bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */
@@ -96,6 +105,8 @@
* attribute(s) */
#define HEAP_HASEXTENDED 0x000C /* the two above combined */
+#define HEAP_XMIN_IS_XMAX 0x0040 /* created and deleted in the */
+ /* same transaction
*/
#define HEAP_XMAX_UNLOGGED 0x0080 /* to lock tuple for update */
/* without logging
*/
#define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */
@@ -108,6 +119,7 @@
* vacuum */
#define HEAP_MOVED_IN 0x8000 /* moved from another place by
* vacuum */
+#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN)
#define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */
@@ -116,53 +128,100 @@
/* HeapTupleHeader accessor macros */
#define HeapTupleHeaderGetXmin(tup) \
- ((tup)->t_xmin)
+( \
+ (tup)->t_xmin \
+)
#define HeapTupleHeaderGetXmax(tup) \
- ((tup)->t_xmax)
+( \
+ ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) ? \
+ (tup)->t_xmin \
+ : \
+ (tup)->t_xmax \
+)
-/* no AssertMacro, because this is read as a system-defined attribute also */
+/* no AssertMacro, because this is read as a system-defined attribute */
#define HeapTupleHeaderGetCmin(tup) \
( \
- (tup)->t_cmin \
+ ((tup)->t_infomask & HEAP_MOVED) ? \
+ FirstCommandId \
+ : \
+ ( \
+ ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \
+ (CommandId) (tup)->t_cid \
+ : \
+ FirstCommandId \
+ ) \
)
#define HeapTupleHeaderGetCmax(tup) \
- ((tup)->t_cmax)
+( \
+ ((tup)->t_infomask & HEAP_MOVED) ? \
+ FirstCommandId \
+ : \
+ ( \
+ ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \
+ (CommandId) (tup)->t_xmax \
+ : \
+ (CommandId) (tup)->t_cid \
+ ) \
+)
#define HeapTupleHeaderGetXvac(tup) \
( \
- AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \
- (TransactionId) (tup)->t_cmin \
+ AssertMacro((tup)->t_infomask & HEAP_MOVED), \
+ (tup)->t_cid \
)
#define HeapTupleHeaderSetXmin(tup, xid) \
- (TransactionIdStore((xid), &(tup)->t_xmin))
+( \
+ TransactionIdStore((xid), &(tup)->t_xmin) \
+)
#define HeapTupleHeaderSetXminInvalid(tup) \
- (StoreInvalidTransactionId(&(tup)->t_xmin))
+do { \
+ (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \
+ StoreInvalidTransactionId(&(tup)->t_xmin); \
+} while (0)
#define HeapTupleHeaderSetXmax(tup, xid) \
- (TransactionIdStore((xid), &(tup)->t_xmax))
+do { \
+ if (TransactionIdEquals((tup)->t_xmin, (xid))) \
+ (tup)->t_infomask |= HEAP_XMIN_IS_XMAX; \
+ else \
+ { \
+ (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \
+ TransactionIdStore((xid), &(tup)->t_xmax); \
+ } \
+} while (0)
#define HeapTupleHeaderSetXmaxInvalid(tup) \
- (StoreInvalidTransactionId(&(tup)->t_xmax))
+do { \
+ (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \
+ StoreInvalidTransactionId(&(tup)->t_xmax); \
+} while (0)
#define HeapTupleHeaderSetCmin(tup, cid) \
-( \
- AssertMacro(!((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF))), \
- (tup)->t_cmin = (cid) \
-)
+do { \
+ Assert(!((tup)->t_infomask & HEAP_MOVED)); \
+ TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \
+} while (0)
#define HeapTupleHeaderSetCmax(tup, cid) \
- ((tup)->t_cmax = (cid))
+do { \
+ Assert(!((tup)->t_infomask & HEAP_MOVED)); \
+ if ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) \
+ TransactionIdStore((TransactionId) (cid), &(tup)->t_xmax); \
+ else \
+ TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \
+} while (0)
#define HeapTupleHeaderSetXvac(tup, xid) \
-( \
- AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \
- TransactionIdStore((xid), (TransactionId *) &((tup)->t_cmin)) \
-)
+do { \
+ Assert((tup)->t_infomask & HEAP_MOVED); \
+ TransactionIdStore((xid), &(tup)->t_cid); \
+} while (0)
/*
Manfred Koizar <mkoi-pg@aon.at> writes:
This patch, which is built upon the "HeapTupleHeader accessor macros"
patch from 2002-06-10, is supposed to reduce the heap tuple header size
by four bytes on most architectures. Of course it changes the on-disk
tuple format and therefore requires initdb.
As I commented before, I am not in favor of this. I don't think that a
four-byte savings justifies a forced initdb with no chance of
pg_upgrade, plus loss of redundancy (= reduced chance of detecting or
recovering from corruption), plus significantly slower access to
several critical header fields. The tqual.c routines are already
hotspots in many scenarios. I believe this will make them noticeably
slower.
regards, tom lane
On Fri, 14 Jun 2002 10:16:22 -0400, Tom Lane <tgl@sss.pgh.pa.us> wrote:
As I commented before, I am not in favor of this. I don't think that a
four-byte savings justifies a forced initdb with no chance of
pg_upgrade,
As I don't know other users' preferences, I cannot comment on this.
I just think that four bytes per tuple can amount to a sum that justifies
this effort. Disk space is not my argument here, but reduced disk IO is.
plus significantly slower access to
several critical header fields. The tqual.c routines are already
hotspots in many scenarios. I believe this will make them noticeably
slower.
Significantly slower? I tried to analyze HeapTupleSatisfiesUpdate(),
as I think it is the most complicated of these Satisfies functions
(look for the !! comments):
/*
!! HeapTupleHeaderGetXmin no slowdown
!! HeapTupleHeaderGetXmax one infomask compare
!! HeapTupleHeaderGetCmin two infomask compares
!! HeapTupleHeaderGetCMax two infomask compares
!! HeapTupleHeaderGetXvac no slowdown
*/
int
HeapTupleSatisfiesUpdate(HeapTuple htuple, CommandId curcid)
{
HeapTupleHeader tuple = htuple->t_data;
if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
{
if (tuple->t_infomask & HEAP_XMIN_INVALID)
/*
!! no slowdown
*/
return HeapTupleInvisible;
if (tuple->t_infomask & HEAP_MOVED_OFF)
{
if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXvac(tuple)))
/*
!! no slowdown
*/
return HeapTupleInvisible;
if (!TransactionIdIsInProgress(HeapTupleHeaderGetXvac(tuple)))
{
if (TransactionIdDidCommit(HeapTupleHeaderGetXvac(tuple)))
{
tuple->t_infomask |= HEAP_XMIN_INVALID;
/*
!! no slowdown
*/
return HeapTupleInvisible;
}
tuple->t_infomask |= HEAP_XMIN_COMMITTED;
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
{
if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXvac(tuple)))
{
if (TransactionIdIsInProgress(HeapTupleHeaderGetXvac(tuple)))
/*
!! no slowdown
*/
return HeapTupleInvisible;
if (TransactionIdDidCommit(HeapTupleHeaderGetXvac(tuple)))
tuple->t_infomask |= HEAP_XMIN_COMMITTED;
else
{
tuple->t_infomask |= HEAP_XMIN_INVALID;
/*
!! no slowdown
*/
return HeapTupleInvisible;
}
}
}
/*
!! no slowdown up to here
*/
else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple)))
{
/*
!! GetCmin does 2 infomask compares
*/
if (HeapTupleHeaderGetCmin(tuple) >= curcid)
/*
!! returning with 2 additional infomask compares
*/
return HeapTupleInvisible; /* inserted after scan
* started */
if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */
/*
!! returning with 2 additional infomask compares
*/
return HeapTupleMayBeUpdated;
/*
!! assertions turned off in production: no slowdown
*/
Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
/*
!! returning with 2 additional infomask compares
*/
return HeapTupleMayBeUpdated;
/*
!! GetCmax does 2 infomask compares
*/
if (HeapTupleHeaderGetCmax(tuple) >= curcid)
/*
!! returning with 4 additional infomask compares
*/
return HeapTupleSelfUpdated; /* updated after scan
* started */
else
/*
!! returning with 4 additional infomask compares
*/
return HeapTupleInvisible; /* updated before scan
* started */
}
/*
!! no slowdown up to here
*/
else if (!TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
{
if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(tuple)))
tuple->t_infomask |= HEAP_XMIN_INVALID; /* aborted */
/*
!! no slowdown
*/
return HeapTupleInvisible;
}
else
tuple->t_infomask |= HEAP_XMIN_COMMITTED;
}
/*
!! no slowdown
*/
/* by here, the inserting transaction has committed */
if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */
/*
!! no slowdown
*/
return HeapTupleMayBeUpdated;
if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
{
if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
/*
!! no slowdown
?? BTW, shouldn't we set HEAP_XMAX_INVALID here?
*/
return HeapTupleMayBeUpdated;
/*
!! no slowdown
*/
return HeapTupleUpdated; /* updated by other */
}
/*
!! no slowdown up to here,
!! one infomask compare to follow in GetXmax(),
!! additional waste of precious CPU cycles could be avoided by:
!! TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
*/
if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
{
if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
/*
!! returning with 1 additional infomask compare
*/
return HeapTupleMayBeUpdated;
/*
!! GetCmax does 2 infomask compares
*/
if (HeapTupleHeaderGetCmax(tuple) >= curcid)
/*
!! returning with 3 additional infomask compares
*/
return HeapTupleSelfUpdated; /* updated after scan
* started */
else
/*
!! returning with 3 additional infomask compares
*/
return HeapTupleInvisible; /* updated before scan started */
}
/*
!! 1 infomask compare up to here,
!! another infomask compare ...
!! or use xmax
*/
if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
{
/*
!! and a third infomask compare
*/
if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
{
tuple->t_infomask |= HEAP_XMAX_INVALID; /* aborted */
/*
!! returning with 1 or 3 additional infomask compares
*/
return HeapTupleMayBeUpdated;
}
/* running xact */
/*
!! returning with 1 or 3 additional infomask compares
*/
return HeapTupleBeingUpdated; /* in updation by other */
}
/*
!! 2 (or 1 with a local xmax) infomask compares up to here
*/
/* xmax transaction committed */
tuple->t_infomask |= HEAP_XMAX_COMMITTED;
if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
/*
?? BTW, shouldn't we set HEAP_XMAX_INVALID here?
*/
return HeapTupleMayBeUpdated;
return HeapTupleUpdated; /* updated by other */
}
So in the worst case we return after having done four more
compares than without the patch. Note that in the most common
cases there is no additional cost at all. If you still think
we have a performance problem here, we could replace GetCmin
and GetCmax by cheaper macros:
#define HeapTupleHeaderGetCminKnowingThatNotMoved(tup) \
( \
AssertMacro(!((tup)->t_infomask & HEAP_MOVED)),
( \
((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \
(CommandId) (tup)->t_cid \
: \
FirstCommandId \
) \
)
thus reducing the additional cost to one t_infomask compare,
because the Satisfies functions only access Cmin and Cmax,
when HEAP_MOVED is known to be not set.
OTOH experimenting with a moderately sized "out of production"
database I got the following results:
| pages | pages |
relkind | count | tuples | before| after | savings
--------+-------+--------+-------+-------+--------
i | 31 | 808146 | 8330 | 8330 | 0.00%
r | 32 | 612968 | 13572 | 13184 | 2.86%
all | 63 | | 21902 | 21514 | 1.77%
2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages.
Considering that index pages tend to benefit more from caching
we conclude that heap pages contribute more to the overall
IO load, so the total savings in the number of disk IOs should
be better than the 1.77% shown in the table above. I think
this outweighs a few CPU cycles now and then.
Servus
Manfred
Your patch has been added to the PostgreSQL unapplied patches list at:
http://candle.pha.pa.us/cgi-bin/pgpatches
I will try to apply it within the next 48 hours.
---------------------------------------------------------------------------
Manfred Koizar wrote:
This patch, which is built upon the "HeapTupleHeader accessor macros"
patch from 2002-06-10, is supposed to reduce the heap tuple header size
by four bytes on most architectures. Of course it changes the on-disk
tuple format and therefore requires initdb. As I have (once more)
opened my mouth too wide, I'll have to provide a heap file conversion
utility, if this patch gets accepted... More on this later.
======================
All 81 tests passed.
======================
It's late now, I'll do more tests tomorrow.
Good night
Manfreddiff -ru ../orig/src/backend/access/heap/heapam.c src/backend/access/heap/heapam.c --- ../orig/src/backend/access/heap/heapam.c 2002-06-13 19:34:48.000000000 +0200 +++ src/backend/access/heap/heapam.c 2002-06-13 22:31:42.000000000 +0200 @@ -2204,7 +2204,7 @@ htup->t_infomask = HEAP_XMAX_INVALID | xlhdr.mask; HeapTupleHeaderSetXmin(htup, record->xl_xid); HeapTupleHeaderSetCmin(htup, FirstCommandId); - HeapTupleHeaderSetXmax(htup, InvalidTransactionId); + HeapTupleHeaderSetXmaxInvalid(htup); HeapTupleHeaderSetCmax(htup, FirstCommandId);offnum = PageAddItem(page, (Item) htup, newlen, offnum, diff -ru ../orig/src/include/access/htup.h src/include/access/htup.h --- ../orig/src/include/access/htup.h 2002-06-13 19:34:49.000000000 +0200 +++ src/include/access/htup.h 2002-06-14 01:12:47.000000000 +0200 @@ -57,15 +57,24 @@ * Also note that we omit the nulls bitmap if t_infomask shows that there * are no nulls in the tuple. */ +/* +** We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac +** in three physical fields t_xmin, t_cid, t_xmax: +** CommandId Cmin; insert CID stamp +** CommandId Cmax; delete CommandId stamp +** TransactionId Xmin; insert XID stamp +** TransactionId Xmax; delete XID stamp +** TransactionId Xvac; used by VACCUUM +** +** This assumes, that a CommandId can be stored in a TransactionId. +*/ typedef struct HeapTupleHeaderData { Oid t_oid; /* OID of this tuple -- 4 bytes */- CommandId t_cmin; /* insert CID stamp -- 4 bytes each */ - CommandId t_cmax; /* delete CommandId stamp */ - - TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */ - TransactionId t_xmax; /* delete XID stamp */ + TransactionId t_xmin; /* Xmin -- 4 bytes each */ + TransactionId t_cid; /* Cmin, Cmax, Xvac */ + TransactionId t_xmax; /* Xmax, Cmax */ItemPointerData t_ctid; /* current TID of this or newer tuple */
@@ -75,7 +84,7 @@
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
- /* ^ - 31 bytes - ^ */ + /* ^ - 27 bytes - ^ */bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */
@@ -96,6 +105,8 @@
* attribute(s) */
#define HEAP_HASEXTENDED 0x000C /* the two above combined */+#define HEAP_XMIN_IS_XMAX 0x0040 /* created and deleted in the */ + /* same transaction */ #define HEAP_XMAX_UNLOGGED 0x0080 /* to lock tuple for update */ /* without logging */ #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */ @@ -108,6 +119,7 @@ * vacuum */ #define HEAP_MOVED_IN 0x8000 /* moved from another place by * vacuum */ +#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN)#define HEAP_XACT_MASK 0xFFF0 /* visibility-related bits */
@@ -116,53 +128,100 @@
/* HeapTupleHeader accessor macros */#define HeapTupleHeaderGetXmin(tup) \ - ((tup)->t_xmin) +( \ + (tup)->t_xmin \ +)#define HeapTupleHeaderGetXmax(tup) \ - ((tup)->t_xmax) +( \ + ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) ? \ + (tup)->t_xmin \ + : \ + (tup)->t_xmax \ +)-/* no AssertMacro, because this is read as a system-defined attribute also */ +/* no AssertMacro, because this is read as a system-defined attribute */ #define HeapTupleHeaderGetCmin(tup) \ ( \ - (tup)->t_cmin \ + ((tup)->t_infomask & HEAP_MOVED) ? \ + FirstCommandId \ + : \ + ( \ + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ + (CommandId) (tup)->t_cid \ + : \ + FirstCommandId \ + ) \ )#define HeapTupleHeaderGetCmax(tup) \ - ((tup)->t_cmax) +( \ + ((tup)->t_infomask & HEAP_MOVED) ? \ + FirstCommandId \ + : \ + ( \ + ((tup)->t_infomask & (HEAP_XMIN_IS_XMAX | HEAP_XMAX_INVALID)) ? \ + (CommandId) (tup)->t_xmax \ + : \ + (CommandId) (tup)->t_cid \ + ) \ +)#define HeapTupleHeaderGetXvac(tup) \ ( \ - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ - (TransactionId) (tup)->t_cmin \ + AssertMacro((tup)->t_infomask & HEAP_MOVED), \ + (tup)->t_cid \ )#define HeapTupleHeaderSetXmin(tup, xid) \ - (TransactionIdStore((xid), &(tup)->t_xmin)) +( \ + TransactionIdStore((xid), &(tup)->t_xmin) \ +)#define HeapTupleHeaderSetXminInvalid(tup) \ - (StoreInvalidTransactionId(&(tup)->t_xmin)) +do { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + StoreInvalidTransactionId(&(tup)->t_xmin); \ +} while (0)#define HeapTupleHeaderSetXmax(tup, xid) \ - (TransactionIdStore((xid), &(tup)->t_xmax)) +do { \ + if (TransactionIdEquals((tup)->t_xmin, (xid))) \ + (tup)->t_infomask |= HEAP_XMIN_IS_XMAX; \ + else \ + { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + TransactionIdStore((xid), &(tup)->t_xmax); \ + } \ +} while (0)#define HeapTupleHeaderSetXmaxInvalid(tup) \ - (StoreInvalidTransactionId(&(tup)->t_xmax)) +do { \ + (tup)->t_infomask &= ~HEAP_XMIN_IS_XMAX; \ + 
StoreInvalidTransactionId(&(tup)->t_xmax); \ +} while (0)#define HeapTupleHeaderSetCmin(tup, cid) \ -( \ - AssertMacro(!((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF))), \ - (tup)->t_cmin = (cid) \ -) +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ +} while (0)#define HeapTupleHeaderSetCmax(tup, cid) \ - ((tup)->t_cmax = (cid)) +do { \ + Assert(!((tup)->t_infomask & HEAP_MOVED)); \ + if ((tup)->t_infomask & HEAP_XMIN_IS_XMAX) \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_xmax); \ + else \ + TransactionIdStore((TransactionId) (cid), &(tup)->t_cid); \ +} while (0)#define HeapTupleHeaderSetXvac(tup, xid) \ -( \ - AssertMacro((tup)->t_infomask & (HEAP_MOVED_IN | HEAP_MOVED_OFF)), \ - TransactionIdStore((xid), (TransactionId *) &((tup)->t_cmin)) \ -) +do { \ + Assert((tup)->t_infomask & HEAP_MOVED); \ + TransactionIdStore((xid), &(tup)->t_cid); \ +} while (0)/*
---------------------------(end of broadcast)---------------------------
TIP 3: if posting/reading through Usenet, please send an appropriate
subscribe-nomail command to majordomo@postgresql.org so that your
message can get through to the mailing list cleanly
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Your patch has been added to the PostgreSQL unapplied patches list at:
http://candle.pha.pa.us/cgi-bin/pgpatches
I will try to apply it within the next 48 hours.
Are you planning to ignore my objections to it?
regards, tom lane
Tom Lane wrote:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Your patch has been added to the PostgreSQL unapplied patches list at:
http://candle.pha.pa.us/cgi-bin/pgpatches
I will try to apply it within the next 48 hours.
Are you planning to ignore my objections to it?
The author replied addressing your objections and I saw no reply from you
on that.
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom Lane wrote:
Are you planning to ignore my objections to it?
The author replied addressing your objections and I saw no reply from you
on that.
He replied stating his opinion; my opinion didn't change. I was waiting
for some other people to weigh in with their opinions. As far as I've
seen, no one else has commented at all.
If I get voted down on the point after suitable discussion, so be it.
But I will strongly object to you applying the patch just because it's
next in your inbox.
regards, tom lane
Tom Lane wrote:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom Lane wrote:
Are you planning to ignore my objections to it?
The author replied addressing your objections and I saw no reply from on
on that.He replied stating his opinion; my opinion didn't change. I was waiting
for some other people to weigh in with their opinions. As far as I've
seen, no one else has commented at all.If I get voted down on the point after suitable discussion, so be it.
But I will strongly object to you applying the patch just because it's
next in your inbox.
Well, we have three votes, yours, the author, and mine. That's a vote.
If you want to make a pitch for more votes, go ahead. I can wait.
The thread went on for quite a while and no one else gave an opinion, as
I remember, though there may have been a few positive ones for the patch
that I forgot about.
I don't care if you object, strongly object, or jump up and down; you
have one vote. Please stop the posturing.
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Tom Lane wrote:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom Lane wrote:
Are you planning to ignore my objections to it?
The author replied addressing your objections and I saw no reply from on
on that.He replied stating his opinion; my opinion didn't change. I was waiting
for some other people to weigh in with their opinions. As far as I've
seen, no one else has commented at all.If I get voted down on the point after suitable discussion, so be it.
But I will strongly object to you applying the patch just because it's
next in your inbox.
Tom, I have reviewed your objections:
As I commented before, I am not in favor of this. I don't think that a
four-byte savings justifies a forced initdb with no chance of
pg_upgrade, plus loss of redundancy (= reduced chance of detecting or
recovering from corruption), plus significantly slower access to
several critical header fields. The tqual.c routines are already
hotspots in many scenarios. I believe this will make them noticeably
slower.
I don't think enough people use pg_upgrade to make it a reason to keep
an extra four bytes of tuple overhead. I realize 8-byte aligned systems
don't benefit, but most of our platforms are 4-byte aligned. I don't
consider redundancy a valid reason either. We just don't have many
table corruption complaints, and the odds that having an extra 4 bytes
is going to make detection or correction better is unlikely.
The author addressed the slowness complaint and seemed to refute the
idea it would be slower.
Is there something I am missing?
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian wrote:
Tom Lane wrote:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom Lane wrote:
Are you planning to ignore my objections to it?
The author replied addressing your objections and I saw no reply from on
on that.He replied stating his opinion; my opinion didn't change. I was waiting
for some other people to weigh in with their opinions. As far as I've
seen, no one else has commented at all.If I get voted down on the point after suitable discussion, so be it.
But I will strongly object to you applying the patch just because it's
next in your inbox.
Tom, I have reviewed your objections:
As I commented before, I am not in favor of this. I don't think that a
four-byte savings justifies a forced initdb with no chance of
pg_upgrade, plus loss of redundancy (= reduced chance of detecting or
recovering from corruption), plus significantly slower access to
several critical header fields. The tqual.c routines are already
hotspots in many scenarios. I believe this will make them noticeably
slower.I don't think enough people use pg_upgrade to make it a reason to keep
an extra four bytes of tuple overhead. I realize 8-byte aligned systems
don't benefit, but most of our platforms are 4-byte aligned. I don't
consider redundency a valid reason either. We just don't have many
table corruption complaints, and the odds that having an extra 4 bytes
is going to make detection or correction better is unlikely.
The non-overwriting storage management (which is one reason why we need
all these header fields) causes over 30 bytes of row overhead anyway. I
am with Tom here, 4 bytes per row isn't worth making the tuple header
variable length size.
The author addressed the slowness complaint and seemed to refute the
idea it would be slower.
Do we have any hard numbers on that? Is it just access to the header
fields, or do we lose the offset cacheability of all fixed size fields
at the beginning of a row? In the latter case count me into the
slowness-believer camp.
Jan
--
#======================================================================#
# It's easier to get forgiveness for being wrong than for being right. #
# Let's break this rule - forgive me. #
#================================================== JanWieck@Yahoo.com #
Jan Wieck wrote:
I don't think enough people use pg_upgrade to make it a reason to keep
an extra four bytes of tuple overhead. I realize 8-byte aligned systems
don't benefit, but most of our platforms are 4-byte aligned. I don't
consider redundency a valid reason either. We just don't have many
table corruption complaints, and the odds that having an extra 4 bytes
is going to make detection or correction better is unlikely.The non-overwriting storage management (which is one reason why whe need
all these header fields) causes over 30 bytes of row overhead anyway. I
am with Tom here, 4 bytes per row isn't worth making the tuple header
variable length size.
Woh, I didn't see anything about making the header variable size. The
issue was that on 8-byte machines, structure alignment will not allow
any savings. However, on 4-byte machines, it will be a savings of ~11%
in the tuple header.
The author addressed the slowness complaint and seemed to refute the
idea it would be slower.Do we have any hard numbers on that? Is it just access to the header
fields, or do we loose the offset cacheability of all fixed size fields
at the beginning of a row? In the latter case count me into the
slowness-believer camp.
No other slowdown except access to the tuple header requires a little
more smarts. As the author mentions, the increased number of tuples per
page more than offset that. In fact, the patch is fairly small, so you
can review it yourself:
http://candle.pha.pa.us/cgi-bin/pgpatches
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Jan Wieck wrote:
I don't think enough people use pg_upgrade to make it a reason to keep
an extra four bytes of tuple overhead. I realize 8-byte aligned systems
don't benefit, but most of our platforms are 4-byte aligned. I don't
consider redundency a valid reason either. We just don't have many
table corruption complaints, and the odds that having an extra 4 bytes
is going to make detection or correction better is unlikely.The non-overwriting storage management (which is one reason why whe need
all these header fields) causes over 30 bytes of row overhead anyway. I
am with Tom here, 4 bytes per row isn't worth making the tuple header
variable length size.The author addressed the slowness complaint and seemed to refute the
idea it would be slower.Do we have any hard numbers on that? Is it just access to the header
fields, or do we loose the offset cacheability of all fixed size fields
at the beginning of a row? In the latter case count me into the
slowness-believer camp.
Here is a summary of the concepts used in the patch:
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom, I have reviewed your objections:
Thanks.
Is there something I am missing?
My principal objection to this is that I do not like piecemeal breakage
of pg_upgrade, disk page examination tools, etc. There are other things
that have been proposed that would require changes of page header or
tuple header format, and I would prefer to see them bundled up and done
as one big change (in some future revision; it's probably too late for
7.3). "Disk format of the week" is not an idea that appeals to me.
I know as well as you do that pg_upgrade compatibility is only
interesting if there *is* a pg_upgrade, which very possibly won't
happen for 7.3 anyway --- but I don't like foreclosing the possibility
for a marginal win. And this is definitely a marginal win. Let's
try to batch up enough changes to make it worth the pain.
In case you are wondering what I am talking about, here is an
off-the-cuff list of things that I'd prefer not to do one at a time:
* Version identifier words in page headers
* CRCs in page headers
* Replication and/or point-in-time recovery might need additional
header fields similar to those used by WAL
* Restructuring of line pointers
* Making OIDs optional (no wasted space if no OID)
* Adding a tuple version identifier to tuples (for DROP COLUMN etc)
* Restructuring index tuple headers (remove length field)
* Fixing array storage to support NULLs inside arrays
* Restoring time travel on some optional basis
Some of these may never happen (I'm not even in favor of all of them)
but it's certain that we will want to do some of them. I don't want to
take the same hit over and over when some intelligent project management
would let us take it just once or twice.
regards, tom lane
Jan Wieck <JanWieck@Yahoo.com> writes:
Do we have any hard numbers on that? Is it just access to the header
fields, or do we loose the offset cacheability of all fixed size fields
at the beginning of a row? In the latter case count me into the
slowness-believer camp.
I don't believe the patch would have made the header variable size,
only changed what the fixed size is. The slowdown I was worried about
was just a matter of a couple extra tests and branches in the tqual.c
routines; which would be negligible if they weren't such hotspots.
regards, tom lane
Bruce Momjian wrote:
Jan Wieck wrote:
I don't think enough people use pg_upgrade to make it a reason to keep
an extra four bytes of tuple overhead. I realize 8-byte aligned systems
don't benefit, but most of our platforms are 4-byte aligned. I don't
consider redundancy a valid reason either. We just don't have many
table corruption complaints, and the odds that having an extra 4 bytes
is going to make detection or correction better is unlikely.

The non-overwriting storage management (which is one reason why we need
all these header fields) causes over 30 bytes of row overhead anyway. I
am with Tom here, 4 bytes per row isn't worth making the tuple header
variable length size.

Whoa, I didn't see anything about making the header variable size. The
issue was that on 8-byte machines, structure alignment will not allow
any savings. However, on 4-byte machines, it will be a savings of ~11%
in the tuple header.
You're right. Dunno where I got that idea from.
Looking at the patch I find it quite confusing using Xmin as Xmax,
sometimes. If we use 3 physical variables for 5 virtual ones in that
way, I would rather use generic names.
Jan
--
#======================================================================#
# It's easier to get forgiveness for being wrong than for being right. #
# Let's break this rule - forgive me. #
#================================================== JanWieck@Yahoo.com #
Tom Lane wrote:
Jan Wieck <JanWieck@Yahoo.com> writes:
Do we have any hard numbers on that? Is it just access to the header
fields, or do we lose the offset cacheability of all fixed size fields
at the beginning of a row? In the latter case count me into the
slowness-believer camp.

I don't believe the patch would have made the header variable size,
only changed what the fixed size is. The slowdown I was worried about
was just a matter of a couple extra tests and branches in the tqual.c
routines; which would be negligible if they weren't such hotspots.
Did someone run at least pgbench with/without that patch applied?
Jan
--
#======================================================================#
# It's easier to get forgiveness for being wrong than for being right. #
# Let's break this rule - forgive me. #
#================================================== JanWieck@Yahoo.com #
Tom Lane wrote:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom, I have reviewed your objections:
Thanks.
Is there something I am missing?
My principal objection to this is that I do not like piecemeal breakage
of pg_upgrade, disk page examination tools, etc. There are other things
What do we have, pg_upgrade and Red Hat's disk dump tool, right? (I
forgot the name.)
that have been proposed that would require changes of page header or
tuple header format, and I would prefer to see them bundled up and done
as one big change (in some future revision; it's probably too late for
7.3). "Disk format of the week" is not an idea that appeals to me.
This is part of the "do it perfect or don't do it" logic that I usually
disagree with. We all agree we have a tuple header that is too large.
Here we have a chance to reduce it by 11% on most platforms, and as a
downside, we will not have a working pg_upgrade. What do you think the
average user will choose?
I know as well as you do that pg_upgrade compatibility is only
interesting if there *is* a pg_upgrade, which very possibly won't
happen for 7.3 anyway --- but I don't like foreclosing the possibility
I am not sure what needs to be done for pg_upgrade for 7.3 (does anyone
else?), but if you suspect it will not work anyway, why are you trying
to save it for 7.3? (I think the problem with pg_upgrade in 7.2 was the
inability to create pg_clog files to match the new transaction counter.)
for a marginal win. And this is definitely a marginal win. Let's
try to batch up enough changes to make it worth the pain.
Marginal? Seems like a pretty concrete win to me in disk space.
Also, he has offered to write a conversion tool. If he does that, maybe
he can improve pg_upgrade if needed.
In case you are wondering what I am talking about, here is an
off-the-cuff list of things that I'd prefer not to do one at a time:

* Version identifier words in page headers
Yes, we need that, and in fact should do that in 7.3 if we accept this
patch. This may make future upgrades easier. In fact, I wonder if we
could place the page version number in such a way that old pre-7.3 pages
could be easily identified, i.e. place version number 0xAE in an offset
that used to hold values that were always less than 0x80.
* CRCs in page headers
* Replication and/or point-in-time recovery might need additional
header fields similar to those used by WAL
* Restructuring of line pointers
* Making OIDs optional (no wasted space if no OID)
* Adding a tuple version identifier to tuples (for DROP COLUMN etc)
* Restructuring index tuple headers (remove length field)
* Fixing array storage to support NULLs inside arrays
* Restoring time travel on some optional basis

Some of these may never happen (I'm not even in favor of all of them)
but it's certain that we will want to do some of them. I don't want to
take the same hit over and over when some intelligent project management
would let us take it just once or twice.
Yes, there are some good ones there, but the idea that somehow they are
all going to hit in the same release seems unlikely. I say let's do
some now, some later, and move ahead.
Ultimately, I think we need to add smarts to the backend to read old
formats. I know it is a pain, but I see no other options. pg_upgrade
always will have trouble with certain changes.
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Jan Wieck wrote:
Tom Lane wrote:
Jan Wieck <JanWieck@Yahoo.com> writes:
Do we have any hard numbers on that? Is it just access to the header
fields, or do we lose the offset cacheability of all fixed size fields
at the beginning of a row? In the latter case count me into the
slowness-believer camp.

I don't believe the patch would have made the header variable size,
only changed what the fixed size is. The slowdown I was worried about
was just a matter of a couple extra tests and branches in the tqual.c
routines; which would be negligible if they weren't such hotspots.

Did someone run at least pgbench with/without that patch applied?
No, but he did perform this analysis:
thus reducing the additional cost to one t_infomask compare,
because the Satisfies functions only access Cmin and Cmax,
when HEAP_MOVED is known to be not set.

OTOH experimenting with a moderately sized "out of production"
database I got the following results:
| pages | pages |
relkind | count | tuples | before| after | savings
--------+-------+--------+-------+-------+--------
i | 31 | 808146 | 8330 | 8330 | 0.00%
r | 32 | 612968 | 13572 | 13184 | 2.86%
all | 63 | | 21902 | 21514 | 1.77%

2.86% fewer heap pages mean 2.86% less disk IO caused by heap pages.
Considering that index pages tend to benefit more from caching
we conclude that heap pages contribute more to the overall
IO load, so the total savings in the number of disk IOs should
be better than the 1.77% shown in the table above. I think
this outweighs a few CPU cycles now and then.
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom Lane wrote:
Some of these may never happen (I'm not even in favor of all of them)
but it's certain that we will want to do some of them. I don't want to
take the same hit over and over when some intelligent project management
would let us take it just once or twice.
Yes, there are some good ones there, but the idea that somehow they are
all going to hit in the same release seems unlikely. I say let's do
some now, some later, and move ahead.
I think it's very likely that P-I-T recovery and replication will hit
in the 7.4 release cycle. I would prefer to see us plan ahead to do
these other changes for 7.4 as well, and get as many of them done in
that cycle as we can.
Basically my point is that there are costs and benefits to this sort
of change, and many of the costs are quantized --- it doesn't cost
more to make three incompatible disk-format changes than one, *as long
as they're in the same release*. So we should try to do some actual
management of such changes, not accept them willy-nilly whenever someone
feels like doing one.
This patch is unlikely to suffer significant bit-rot if we hold it for
7.4, and that's what I think we should do with it.
regards, tom lane
Tom Lane wrote:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
Tom Lane wrote:
Some of these may never happen (I'm not even in favor of all of them)
but it's certain that we will want to do some of them. I don't want to
take the same hit over and over when some intelligent project management
would let us take it just once or twice.

Yes, there are some good ones there, but the idea that somehow they are
all going to hit in the same release seems unlikely. I say let's do
some now, some later, and move ahead.

I think it's very likely that P-I-T recovery and replication will hit
in the 7.4 release cycle. I would prefer to see us plan ahead to do
these other changes for 7.4 as well, and get as many of them done in
that cycle as we can.

Basically my point is that there are costs and benefits to this sort
of change, and many of the costs are quantized --- it doesn't cost
more to make three incompatible disk-format changes than one, *as long
as they're in the same release*. So we should try to do some actual
management of such changes, not accept them willy-nilly whenever someone
feels like doing one.

This patch is unlikely to suffer significant bit-rot if we hold it for
7.4, and that's what I think we should do with it.
Well, that is a different argument than initially. So, it is a valid
patch, but we have to decide when to apply it.
We can easily hold it until we near release of 7.3. If pg_upgrade is in
good shape _and_ no other format changes are required, we can hold it
for 7.4. What happens if 7.4 doesn't have any format changes?
However, if we have other changes or pg_upgrade isn't going to work, we
can apply it in August.
(Manfred, you can be sure I will not lose this patch.)
However, we have been very bad at predicting what features/changes are
coming in future releases. Also, I don't think replication will require
tuple format changes. I will check with the group but I haven't heard
anything about that.
So, we have to decide if we apply it now or delay it for later in 7.3,
or for >=7.4.
My personal vote is that we apply it now, and perhaps try some of the
other format changes we were going to make. Tom's vote is to hold it, I
assume. Let's see what others say.
--
Bruce Momjian | http://candle.pha.pa.us
pgman@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026