pageinspect: add tuple_data_record()

Started by James Colemanabout 7 years ago16 messages

jtc331@gmail.com

about 7 years ago

1 attachment(s)

Background:
In my usage of pageinspect for debugging or other purposes I often find it
frustrating that I don't have a way to easily view a heap page's tuple data
as actual records rather than binary data. After encountering this again
last week while doing some data recovery after an accidental delete (and
manually parsing a few int and timestamp fields with get_byte magic) I
decided to write up a patch to add a function to handle this case.

Summary:
The new function tuple_data_record() parallels the existing
tuple_data_split() function, but instead of returning a bytea array of raw
attribute heap values, it returns a row type of the relation being examined.

Status:
The attached patch applies to master and builds cleanly and passes tests
locally. It is still WIP (see TODOs below), but is (I believe) functionally
complete.

This is my first patch submission, so I encountered a few questions that
the coding style docs didn't seem to address, like whether it's preferable
to line up variable declarations by name.

After my refactor of tuple_data_split_internal() I'm considering inlining
that function (and the similar tuple_data_record_internal() function) into
tuple_data_{split,record}(). I'd appreciate feedback on this possibility.

TODO:
- Add documentation.
- Consider inlining _internal functions.

Thanks,
James Coleman

Attachments:

tuple-data-record-v1.patchapplication/octet-stream; name=tuple-data-record-v1.patchDownload

commit 89ec8b9cf336f14201d35218054d033d11717118
Author: jcoleman <jtc331@gmail.com>
Date:   Mon Oct 15 09:18:28 2018 -0400

    WIP - Add tuple_data_record() to pageinspect

diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile
index e5a581f141..a1945c5caa 100644
--- a/contrib/pageinspect/Makefile
+++ b/contrib/pageinspect/Makefile
@@ -5,7 +5,8 @@ OBJS		= rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \
 		  brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES)
 
 EXTENSION = pageinspect
-DATA =  pageinspect--1.6--1.7.sql \
+DATA =  pageinspect--1.7--1.8.sql \
+	pageinspect--1.6--1.7.sql \
 	pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
 	pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \
 	pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \
diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out
index 3fcd9fbe6d..25e4c1a9ae 100644
--- a/contrib/pageinspect/expected/page.out
+++ b/contrib/pageinspect/expected/page.out
@@ -62,6 +62,13 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi
  {"\\x01000001","\\x00020200"}
 (1 row)
 
+SELECT tuple_data_record('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
+    FROM heap_page_items(get_raw_page('test1', 0));
+ tuple_data_record 
+-------------------
+ (16777217,131584)
+(1 row)
+
 SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
  fsm_page_contents 
 -------------------
diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c
index d96ba1e8b6..5996e260c7 100644
--- a/contrib/pageinspect/heapfuncs.c
+++ b/contrib/pageinspect/heapfuncs.c
@@ -277,33 +277,22 @@ heap_page_items(PG_FUNCTION_ARGS)
 }
 
 /*
- * tuple_data_split_internal
+ * parse_tuple_attr_info
  *
- * Split raw tuple data taken directly from a page into an array of bytea
- * elements. This routine does a lookup on NULL values and creates array
- * elements accordingly. This is a reimplementation of nocachegetattr()
- * in heaptuple.c simplified for educational purposes.
+ * Read raw tuple data and determine attribute offset, length, and null status.
+ * This is a reimplementation of nocachegetattr() in heaptuple.c simplified
+ * for educational purposes.
  */
-static Datum
-tuple_data_split_internal(Oid relid, char *tupdata,
-						  uint16 tupdata_len, uint16 t_infomask,
-						  uint16 t_infomask2, bits8 *t_bits,
-						  bool do_detoast)
+static void
+parse_tuple_attr_info(TupleDesc tupdesc, char *tupdata, uint16 tupdata_len,
+		uint16 t_infomask, uint16 t_infomask2, bits8 *t_bits,
+		bool *is_null, int *offsets, int *lengths, bool *varlen)
 {
-	ArrayBuildState *raw_attrs;
-	int			nattrs;
 	int			i;
-	int			off = 0;
-	Relation	rel;
-	TupleDesc	tupdesc;
+	int nattrs;
+	int off = 0;
 
-	/* Get tuple descriptor from relation OID */
-	rel = relation_open(relid, AccessShareLock);
-	tupdesc = RelationGetDescr(rel);
-
-	raw_attrs = initArrayResult(BYTEAOID, CurrentMemoryContext, false);
 	nattrs = tupdesc->natts;
-
 	if (nattrs < (t_infomask2 & HEAP_NATTS_MASK))
 		ereport(ERROR,
 				(errcode(ERRCODE_DATA_CORRUPTED),
@@ -312,8 +301,7 @@ tuple_data_split_internal(Oid relid, char *tupdata,
 	for (i = 0; i < nattrs; i++)
 	{
 		Form_pg_attribute attr;
-		bool		is_null;
-		bytea	   *attr_data = NULL;
+		bool attr_isnull;
 
 		attr = TupleDescAttr(tupdesc, i);
 
@@ -324,16 +312,20 @@ tuple_data_split_internal(Oid relid, char *tupdata,
 		 * (t_infomask2 & HEAP_NATTS_MASK) should be treated as NULL.
 		 */
 		if (i >= (t_infomask2 & HEAP_NATTS_MASK))
-			is_null = true;
+			attr_isnull = true;
 		else
-			is_null = (t_infomask & HEAP_HASNULL) && att_isnull(i, t_bits);
+			attr_isnull = (t_infomask & HEAP_HASNULL) && att_isnull(i, t_bits);
 
-		if (!is_null)
+		is_null[i] = attr_isnull;
+
+		if (!attr_isnull)
 		{
-			int			len;
+			int len;
 
 			if (attr->attlen == -1)
 			{
+				if (varlen)
+					varlen[i] = true;
 				off = att_align_pointer(off, attr->attalign, -1,
 										tupdata + off);
 
@@ -362,77 +354,137 @@ tuple_data_split_internal(Oid relid, char *tupdata,
 						(errcode(ERRCODE_DATA_CORRUPTED),
 						 errmsg("unexpected end of tuple data")));
 
-			if (attr->attlen == -1 && do_detoast)
-				attr_data = DatumGetByteaPCopy(tupdata + off);
-			else
-			{
-				attr_data = (bytea *) palloc(len + VARHDRSZ);
-				SET_VARSIZE(attr_data, len + VARHDRSZ);
-				memcpy(VARDATA(attr_data), tupdata + off, len);
-			}
+			lengths[i] = len;
+			offsets[i] = off;
 
 			off = att_addlength_pointer(off, attr->attlen,
 										tupdata + off);
 		}
+	}
+
+	if (tupdata_len != off)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("end of tuple reached without looking at all its data")));
+}
+
+
+/*
+ * tuple_data_split_internal
+ *
+ * Split raw tuple data taken directly from a page into an array of bytea
+ * elements. This routine does a lookup on NULL values and creates array
+ * elements accordingly.
+ */
+static Datum
+tuple_data_split_internal(Oid relid, char *tupdata,
+			uint16 tupdata_len, uint16 t_infomask,
+			uint16 t_infomask2, bits8 *t_bits,
+			bool do_detoast)
+{
+	ArrayBuildState *raw_attrs;
+	int			nattrs;
+	int			i;
+	Relation	rel;
+	TupleDesc	tupdesc;
+	bool *isnull;
+	bool *varlen;
+	int *offsets;
+	int *lengths;
+
+	/* Get tuple descriptor from relation OID */
+	rel = relation_open(relid, AccessShareLock);
+	tupdesc = RelationGetDescr(rel);
+
+	raw_attrs = initArrayResult(BYTEAOID, CurrentMemoryContext, false);
+	nattrs = tupdesc->natts;
+	isnull = (bool *)palloc(sizeof(bool) * nattrs);
+	varlen = (bool *)palloc0(sizeof(bool) * nattrs);
+	offsets = (int *)palloc(sizeof(int) * nattrs);
+	lengths = (int *)palloc(sizeof(int) * nattrs);
+
+	parse_tuple_attr_info(tupdesc, tupdata, tupdata_len, t_infomask, t_infomask2, t_bits,
+		isnull, offsets, lengths, varlen);
+
+	/* Fetch all tuple attribute values */
+	for (i = 0; i < nattrs; i++)
+	{
+		bytea *attr_data = NULL;
+			if (varlen[i] && do_detoast)
+				attr_data = DatumGetByteaPCopy(tupdata + offsets[i]);
+			else
+			{
+				attr_data = (bytea *) palloc(lengths[i] + VARHDRSZ);
+				SET_VARSIZE(attr_data, lengths[i] + VARHDRSZ);
+				memcpy(VARDATA(attr_data), tupdata + offsets[i], lengths[i]);
+			}
 
 		raw_attrs = accumArrayResult(raw_attrs, PointerGetDatum(attr_data),
-									 is_null, BYTEAOID, CurrentMemoryContext);
+									 isnull[i], BYTEAOID, CurrentMemoryContext);
 		if (attr_data)
 			pfree(attr_data);
 	}
 
-	if (tupdata_len != off)
-		ereport(ERROR,
-				(errcode(ERRCODE_DATA_CORRUPTED),
-				 errmsg("end of tuple reached without looking at all its data")));
-
 	relation_close(rel, AccessShareLock);
 
 	return makeArrayResult(raw_attrs, CurrentMemoryContext);
 }
 
 /*
- * tuple_data_split
+ * tuple_data_record_internal
  *
- * Split raw tuple data taken directly from page into distinct elements
- * taking into account null values.
+ * Split raw tuple data taken directly from a page into record of the page's
+ * relation's row tyoe.
  */
-PG_FUNCTION_INFO_V1(tuple_data_split);
+static Datum
+tuple_data_split_internal_record(Oid relid, char *tupdata,
+			uint16 tupdata_len, uint16 t_infomask,
+			uint16 t_infomask2, bits8 *t_bits)
+{
+	int i;
+	int nattrs;
+	Relation	rel;
+	TupleDesc	tupdesc;
+	Datum *values;
+	bool *isnull;
+	int *offsets;
+	int *lengths;
+
+	/* Get tuple descriptor from relation OID */
+	rel = relation_open(relid, AccessShareLock);
+	tupdesc = RelationGetDescr(rel);
+
+	nattrs = tupdesc->natts;
+	isnull = (bool *)palloc(sizeof(bool) * nattrs);
+	offsets = (int *)palloc(sizeof(int) * nattrs);
+	lengths = (int *)palloc(sizeof(int) * nattrs);
+	values = palloc(sizeof(Datum) * nattrs);
+
+	parse_tuple_attr_info(tupdesc, tupdata, tupdata_len, t_infomask, t_infomask2, t_bits,
+		isnull, offsets, lengths, NULL);
+
+	/* Fetch all tuple attribute values */
+	for (i = 0; i < tupdesc->natts; i++)
+	{
+		values[i] = fetchatt(TupleDescAttr(tupdesc, i), tupdata + offsets[i]);
+	}
 
-Datum
-tuple_data_split(PG_FUNCTION_ARGS)
+	relation_close(rel, AccessShareLock);
+
+	return HeapTupleGetDatum(heap_form_tuple(tupdesc, values, isnull));
+}
+
+/*
+ * parse_and_verify_t_bits
+ *
+ * Convert t_bits string back to the bits8 array as represented in the
+ * tuple header and error check results.
+ */
+static bits8 *
+parse_and_verify_t_bits(char *t_bits_str, uint16 t_infomask, uint16 t_infomask2)
 {
-	Oid			relid;
-	bytea	   *raw_data;
-	uint16		t_infomask;
-	uint16		t_infomask2;
-	char	   *t_bits_str;
-	bool		do_detoast = false;
-	bits8	   *t_bits = NULL;
-	Datum		res;
-
-	relid = PG_GETARG_OID(0);
-	raw_data = PG_ARGISNULL(1) ? NULL : PG_GETARG_BYTEA_P(1);
-	t_infomask = PG_GETARG_INT16(2);
-	t_infomask2 = PG_GETARG_INT16(3);
-	t_bits_str = PG_ARGISNULL(4) ? NULL :
-		text_to_cstring(PG_GETARG_TEXT_PP(4));
-
-	if (PG_NARGS() >= 6)
-		do_detoast = PG_GETARG_BOOL(5);
-
-	if (!superuser())
-		ereport(ERROR,
-				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
-				 errmsg("must be superuser to use raw page functions")));
-
-	if (!raw_data)
-		PG_RETURN_NULL();
-
-	/*
-	 * Convert t_bits string back to the bits8 array as represented in the
-	 * tuple header.
-	 */
+	bits8	*t_bits = NULL;
+
 	if (t_infomask & HEAP_HASNULL)
 	{
 		int			bits_str_len;
@@ -464,6 +516,49 @@ tuple_data_split(PG_FUNCTION_ARGS)
 							strlen(t_bits_str))));
 	}
 
+	return t_bits;
+}
+
+/*
+ * tuple_data_split
+ *
+ * Split raw tuple data taken directly from page into distinct elements
+ * taking into account null values.
+ */
+PG_FUNCTION_INFO_V1(tuple_data_split);
+
+Datum
+tuple_data_split(PG_FUNCTION_ARGS)
+{
+	Oid			relid;
+	bytea	   *raw_data;
+	uint16		t_infomask;
+	uint16		t_infomask2;
+	char	   *t_bits_str;
+	bool		do_detoast = false;
+	bits8	   *t_bits = NULL;
+	Datum		res;
+
+	relid = PG_GETARG_OID(0);
+	raw_data = PG_ARGISNULL(1) ? NULL : PG_GETARG_BYTEA_P(1);
+	t_infomask = PG_GETARG_INT16(2);
+	t_infomask2 = PG_GETARG_INT16(3);
+	t_bits_str = PG_ARGISNULL(4) ? NULL :
+		text_to_cstring(PG_GETARG_TEXT_PP(4));
+
+	if (PG_NARGS() >= 6)
+		do_detoast = PG_GETARG_BOOL(5);
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	if (!raw_data)
+		PG_RETURN_NULL();
+
+	t_bits = parse_and_verify_t_bits(t_bits_str, t_infomask, t_infomask2);
+
 	/* Split tuple data */
 	res = tuple_data_split_internal(relid, (char *) raw_data + VARHDRSZ,
 									VARSIZE(raw_data) - VARHDRSZ,
@@ -475,3 +570,43 @@ tuple_data_split(PG_FUNCTION_ARGS)
 
 	PG_RETURN_ARRAYTYPE_P(res);
 }
+
+PG_FUNCTION_INFO_V1(tuple_data_record);
+
+Datum
+tuple_data_record(PG_FUNCTION_ARGS)
+{
+	Oid	relid;
+	bytea *raw_data;
+	uint16 t_infomask;
+	uint16 t_infomask2;
+	char *t_bits_str;
+	bits8 *t_bits = NULL;
+	Datum res;
+
+	relid = PG_GETARG_OID(0);
+	raw_data = PG_ARGISNULL(1) ? NULL : PG_GETARG_BYTEA_P(1);
+	t_infomask = PG_GETARG_INT16(2);
+	t_infomask2 = PG_GETARG_INT16(3);
+	t_bits_str = PG_ARGISNULL(4) ? NULL :
+		text_to_cstring(PG_GETARG_TEXT_PP(4));
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	if (!raw_data)
+		PG_RETURN_NULL();
+
+	t_bits = parse_and_verify_t_bits(t_bits_str, t_infomask, t_infomask2);
+
+	res = tuple_data_split_internal_record(relid, (char *) raw_data + VARHDRSZ,
+									VARSIZE(raw_data) - VARHDRSZ,
+									t_infomask, t_infomask2, t_bits);
+
+	if (t_bits)
+		pfree(t_bits);
+
+	return res;
+}
diff --git a/contrib/pageinspect/pageinspect.control b/contrib/pageinspect/pageinspect.control
index dcfc61f22d..f8cdf526c6 100644
--- a/contrib/pageinspect/pageinspect.control
+++ b/contrib/pageinspect/pageinspect.control
@@ -1,5 +1,5 @@
 # pageinspect extension
 comment = 'inspect the contents of database pages at a low level'
-default_version = '1.7'
+default_version = '1.8'
 module_pathname = '$libdir/pageinspect'
 relocatable = true
diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql
index 8ac9991837..08fd79248a 100644
--- a/contrib/pageinspect/sql/page.sql
+++ b/contrib/pageinspect/sql/page.sql
@@ -29,6 +29,9 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_
 SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
     FROM heap_page_items(get_raw_page('test1', 0));
 
+SELECT tuple_data_record('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
+    FROM heap_page_items(get_raw_page('test1', 0));
+
 SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
 
 DROP TABLE test1;

Andres Freund

andres@anarazel.de

about 7 years ago

In reply to: James Coleman (#1)

Re: pageinspect: add tuple_data_record()

Hi,

On 2018-10-16 21:39:02 -0400, James Coleman wrote:

Background:
In my usage of pageinspect for debugging or other purposes I often find it
frustrating that I don't have a way to easily view a heap page's tuple data
as actual records rather than binary data. After encountering this again
last week while doing some data recovery after an accidental delete (and
manually parsing a few int and timestamp fields with get_byte magic) I
decided to write up a patch to add a function to handle this case.

Summary:
The new function tuple_data_record() parallels the existing
tuple_data_split() function, but instead of returning a bytea array of raw
attribute heap values, it returns a row type of the relation being examined.

I don't think it's entirely safe to do so for invisible rows. The toast
references could be reused, constraints be violated, etc. So while I
could have used this several times before, it's also very likely a good
way to cause trouble. It'd probably be ok to just fetch the binary
value of the columns, but detoasting sure ain't ok.

This is my first patch submission, so I encountered a few questions that
the coding style docs didn't seem to address, like whether it's preferable
to line up variable declarations by name.

There's ./src/tools/pgindent/pgindent (which needs an external dep), to
do that kind of thing...

Greetings,

Andres Freund

James Coleman

jtc331@gmail.com

about 7 years ago

In reply to: Andres Freund (#2)

Re: pageinspect: add tuple_data_record()

I don't think it's entirely safe to do so for invisible rows. The toast
references could be reused, constraints be violated, etc. So while I
could have used this several times before, it's also very likely a good
way to cause trouble. It'd probably be ok to just fetch the binary
value of the columns, but detoasting sure ain't ok.

Wouldn’t that be equally true for the already existing toast option to
tuple_data_split()?

Is “unsafe” here something that would lead to a crash? Or just returning
invalid data? Given that pageinspect is already clearly an internal viewing
of data, I think allowing invalid or limited data is a reasonable result.

Alternatively, would returning only inline values be safe (and say,
returning null for any toasted data)?

This is my first patch submission, so I encountered a few questions that
the coding style docs didn't seem to address, like whether it's

preferable

to line up variable declarations by name.

There's ./src/tools/pgindent/pgindent (which needs an external dep), to
do that kind of thing...

Yes, though seems like maybe the style guide could be a bit more
descriptive in some of these areas to be more friendly to newcomers. In
contrast the wiki page for submitting a patch is extremely detailed.

Thanks,
James Coleman

Peter Geoghegan

pg@bowt.ie

about 7 years ago

In reply to: James Coleman (#1)

Re: pageinspect: add tuple_data_record()

On Tue, Oct 16, 2018 at 6:39 PM James Coleman <jtc331@gmail.com> wrote:

Summary:
The new function tuple_data_record() parallels the existing tuple_data_split() function, but instead of returning a bytea array of raw attribute heap values, it returns a row type of the relation being examined.

I've been doing something similar with a modified
btreefuncs.c/bt_page_items() recently. The approach I've taken is
pretty grotty. It would be nice to have a reasonable way of doing
this, for both heap tuples and index tuples. (Relatedly,
bt_page_items() should probably have a bytea "data" field instead of a
text "data" field.)

--
Peter Geoghegan

Andres Freund

andres@anarazel.de

about 7 years ago

In reply to: James Coleman (#3)

Re: pageinspect: add tuple_data_record()

Hi,

On 2018-10-16 22:05:28 -0400, James Coleman wrote:

I don't think it's entirely safe to do so for invisible rows. The toast
references could be reused, constraints be violated, etc. So while I
could have used this several times before, it's also very likely a good
way to cause trouble. It'd probably be ok to just fetch the binary
value of the columns, but detoasting sure ain't ok.

Wouldn’t that be equally true for the already existing toast option to
tuple_data_split()?

Yes. Whoever added that didn't think far enough. Teodor, Nikolay?

Is “unsafe” here something that would lead to a crash? Or just returning
invalid data? Given that pageinspect is already clearly an internal viewing
of data, I think allowing invalid or limited data is a reasonable result.

Both.

Alternatively, would returning only inline values be safe (and say,
returning null for any toasted data)?

I think that'd be somewhat confusing.

I don't think it's necessarily just toast that's a problem here, that
was just an obvious point.

Yes, though seems like maybe the style guide could be a bit more
descriptive in some of these areas to be more friendly to newcomers. In
contrast the wiki page for submitting a patch is extremely detailed.

I think there's a lot in that area we should improve.

Greetings,

Andres Freund

Nikolay Shaplov

dhyan@nataraj.su

about 7 years ago

In reply to: Andres Freund (#5)

Re: pageinspect: add tuple_data_record()

В письме от 16 октября 2018 19:17:20 пользователь Andres Freund написал:

Hi,

On 2018-10-16 22:05:28 -0400, James Coleman wrote:

I don't think it's entirely safe to do so for invisible rows. The toast
references could be reused, constraints be violated, etc. So while I
could have used this several times before, it's also very likely a good
way to cause trouble. It'd probably be ok to just fetch the binary
value of the columns, but detoasting sure ain't ok.

Wouldn’t that be equally true for the already existing toast option to
tuple_data_split()?

Yes. Whoever added that didn't think far enough. Teodor, Nikolay?

I did compleatly got the question... The question is it safe to split tuple
record into array of raw bytea? It is quite safe from my point of view. We
use only data that is inside the tuple, and info from pg_catalog that
describes the tuple structure. So we are not affected if for example toast
table were cleaned by vacuum. If you try to deTOAST data when TOAST table were
already overwritten by new data, you can get some trouble...

Is “unsafe” here something that would lead to a crash? Or just returning
invalid data?

I think that you should try and see. Find a way to make postgres vacuum toast
table, and overwrite it with some new data, but keep tuple in heap page
untouched. Then try to deTOAST old data and see what happened. (Hint: it is
good to turn off compression for your data to be TOASTed, so you can just look
into toast-file in binary mode and get an idea what in it now)

======

Summary:
The new function tuple_data_record() parallels the existing
tuple_data_split() function, but instead of returning a bytea array of raw
attribute heap values, it returns a row type of the relation being
examined.

I find this approach a bit wrong. pageinspect is about viewing data in raw
mode, not about looking at the tuples that were deleted. It can be used for
this task, but it has other main purpose.

I would suggest instead of automatically detoast and uncompress actual data, I
would add functions that:

1. Checks if TOAST id is valid, and data can be fetched
2. Fetches that data and print it in raw mode
3. Says if binary data is compressed varlen data
4. Uncompress binary data and return it as another binary data
5. Convert binary data into string.

or something like that. This would be more according to the pageinspect
spirit, as I see it.

Than using these functions you can write pure sql wrapper that do what you
really want.

--
Do code for fun.

James Coleman

jtc331@gmail.com

about 7 years ago

In reply to: Nikolay Shaplov (#6)

Re: pageinspect: add tuple_data_record()

I did compleatly got the question... The question is it safe to split
tuple
record into array of raw bytea? It is quite safe from my point of view.
We
use only data that is inside the tuple, and info from pg_catalog that
describes the tuple structure. So we are not affected if for example toast
table were cleaned by vacuum. If you try to deTOAST data when TOAST table
were
already overwritten by new data, you can get some trouble...

The existing tuple_data_split() method already explicitly allows deTOASTing
data,
so if this is a problem, the problem already exists in pageinspect.

I find this approach a bit wrong. pageinspect is about viewing data in raw
mode, not about looking at the tuples that were deleted. It can be used
for
this task, but it has other main purpose.

...

Than using these functions you can write pure sql wrapper that do what you
really want.

It's actually extremely difficult to extract real data types from the raw
disk data in pure
SQL, which was the reason for this patch.

When using pageinspect to investigate the raw data behind a potential bug
or other issue
(for example I've used it in the past pre-9.6 to check the xmin/xmax
information to help
track down replication bugs (determining if the tuple information was there
but just not
visible after a vacuum or if the tuple data itself was missing). Even in
this kind of "under
the hood" inspection it's quite useful to be able to see actual data types
rather than binary
data.

In these kinds of use cases I believe that just ignoring TOASTed data is
even acceptable
because once you're already down the rabbit hole of looking at non-visible
information
you've already accepted that the data may no longer be complete or valid.

Nikolay Shaplov

dhyan@nataraj.su

about 7 years ago

In reply to: James Coleman (#7)

Re: pageinspect: add tuple_data_record()

В письме от 17 октября 2018 12:36:54 пользователь James Coleman написал:

I did compleatly got the question... The question is it safe to split
tuple
record into array of raw bytea? It is quite safe from my point of view.
We
use only data that is inside the tuple, and info from pg_catalog that
describes the tuple structure. So we are not affected if for example toast
table were cleaned by vacuum. If you try to deTOAST data when TOAST table
were
already overwritten by new data, you can get some trouble...

The existing tuple_data_split() method already explicitly allows deTOASTing
data,
so if this is a problem, the problem already exists in pageinspect.

Oh. that's true... I do not remember writing it, but it seems I did it. After
Andreas notion, I am sure this should be fixed by removing de_toast option for
tuple_data_split... O_o

--
Do code for fun.

Andres Freund

andres@anarazel.de

about 7 years ago

In reply to: James Coleman (#7)

Re: pageinspect: add tuple_data_record()

On 2018-10-17 12:36:54 -0400, James Coleman wrote:

I did compleatly got the question... The question is it safe to split
tuple
record into array of raw bytea? It is quite safe from my point of view.
We
use only data that is inside the tuple, and info from pg_catalog that
describes the tuple structure. So we are not affected if for example toast
table were cleaned by vacuum. If you try to deTOAST data when TOAST table
were
already overwritten by new data, you can get some trouble...

The existing tuple_data_split() method already explicitly allows deTOASTing
data,
so if this is a problem, the problem already exists in pageinspect.

Indeed. But I do think your approach - which means that the binary data is
actually interpreded as a datum of a specific type, drastically
increases the risk.

Greetings,

Andres Freund

#10

James Coleman

jtc331@gmail.com

about 7 years ago

In reply to: Andres Freund (#9)

Re: pageinspect: add tuple_data_record()

Indeed. But I do think your approach - which means that the binary data is

actually interpreded as a datum of a specific type, drastically
increases the risk.

Agreed.

As I noted earlier, I don't at all think deTOASTing is a must for this
function to be
valuable, just as tuple_data_split() is also valuable without deTOASTINGing.

I believe "best effort" is very reasonable in the case of a what is an
investigatory
method to begin with.

#11

Andres Freund

andres@anarazel.de

about 7 years ago

In reply to: James Coleman (#10)

Re: pageinspect: add tuple_data_record()

Hi,

On 2018-10-17 13:04:33 -0400, James Coleman wrote:

Indeed. But I do think your approach - which means that the binary data is

actually interpreded as a datum of a specific type, drastically
increases the risk.

Agreed.

As I noted earlier, I don't at all think deTOASTing is a must for this
function to be
valuable, just as tuple_data_split() is also valuable without deTOASTINGing.

I believe "best effort" is very reasonable in the case of a what is an
investigatory
method to begin with.

It's far from only toast that could be problematic here.

Greetings,

Andres Freund

#12

James Coleman

jtc331@gmail.com

about 7 years ago

In reply to: Andres Freund (#11)

Re: pageinspect: add tuple_data_record()

It's far from only toast that could be problematic here.

Do you have an example in mind? Wouldn’t the tuples have to be corrupted in
some way to have problems with being interpreted as a datum? Or are you
thinking very old tuples with a radically different structure to be causing
the problem?

#13

Andres Freund

andres@anarazel.de

about 7 years ago

In reply to: James Coleman (#12)

Re: pageinspect: add tuple_data_record()

Hi,

On 2018-10-17 13:14:17 -0400, James Coleman wrote:

It's far from only toast that could be problematic here.

Do you have an example in mind? Wouldn’t the tuples have to be corrupted in
some way to have problems with being interpreted as a datum? Or are you
thinking very old tuples with a radically different structure to be causing
the problem?

There's plenty ways it can go horribly wrong. Let's start with something
simple:

BEGIN;
ALTER TABLE ... ADD COLUMN blarg INT;
INSERT ... (blag) VALUES (132467890);
ROLLBACK;

ALTER TABLE ... ADD COLUMN blarg TEXT;

If you now read the table with your function you'll see a dead row that
will re-interpret a int datum as a text datum. Which in all likelyhood
will crash the server.

And yes, it's more likely to break with your patch, but I think the
current *split* function is dangerous too, and shouldn't have been added
without *A LOT* more attention to issues like this.

Greetings,

Andres Freund

#14

James Coleman

jtc331@gmail.com

about 7 years ago

In reply to: Andres Freund (#13)

Re: pageinspect: add tuple_data_record()

There's plenty ways it can go horribly wrong. Let's start with something
simple:

BEGIN;
ALTER TABLE ... ADD COLUMN blarg INT;
INSERT ... (blag) VALUES (132467890);
ROLLBACK;

ALTER TABLE ... ADD COLUMN blarg TEXT;

If you now read the table with your function you'll see a dead row that
will re-interpret a int datum as a text datum. Which in all likelyhood
will crash the server.

That particular case gives this result:
ERROR: number of attributes in tuple header is greater than number of
attributes in tuple descriptor

Some more extended monkeying with adding/dropping columns repeatedly
gave this result:
ERROR: unexpected end of tuple data

That error (unexpected end of tuple data) should (at least in the non-TOAST
case)
prevent the bug of reading beyond the raw tuple data in memory, which would
be
the easiest way I could imagine to cause a serious problem.

Is there a case that could crash outside of a non-primitive type that has
unsafe
data reading code?

#15

Andres Freund

andres@anarazel.de

about 7 years ago

In reply to: James Coleman (#14)

Re: pageinspect: add tuple_data_record()

On 2018-10-17 17:02:20 -0400, James Coleman wrote:

There's plenty ways it can go horribly wrong. Let's start with something
simple:

BEGIN;
ALTER TABLE ... ADD COLUMN blarg INT;
INSERT ... (blag) VALUES (132467890);
ROLLBACK;

ALTER TABLE ... ADD COLUMN blarg TEXT;

If you now read the table with your function you'll see a dead row that
will re-interpret a int datum as a text datum. Which in all likelyhood
will crash the server.

That particular case gives this result:
ERROR: number of attributes in tuple header is greater than number of
attributes in tuple descriptor

I don't see why you'd get that error, if you re-add a column (with a
different type), as I did above? Obviously the "replacement" column
addition would need to be committed.

Some more extended monkeying with adding/dropping columns repeatedly
gave this result:
ERROR: unexpected end of tuple data

That error (unexpected end of tuple data) should (at least in the non-TOAST
case)
prevent the bug of reading beyond the raw tuple data in memory, which would
be
the easiest way I could imagine to cause a serious problem.

You don't need to read beyond the end of the data. You just need to do
something like reinterpret types, where the original type looks enough
like a toast header (e.g.) to cause problems.

Is there a case that could crash outside of a non-primitive type that has
unsafe data reading code?

Just about anything that's not a fixed length type would be unsafe.

Greetings,

Andres Freund

#16

James Coleman

jtc331@gmail.com

about 7 years ago

In reply to: Andres Freund (#15)

Re: pageinspect: add tuple_data_record()

I don't see why you'd get that error, if you re-add a column (with a
different type), as I did above? Obviously the "replacement" column
addition would need to be committed.

Here's my test case:
CREATE TABLE t3(i INTEGER);

BEGIN;
ALTER TABLE t3 ADD COLUMN blarg INT;
INSERT INTO t3(blarg) VALUES (132467890);
ROLLBACK;

ALTER TABLE t3 ADD COLUMN blarg TEXT;

SELECT *, tuple_data_split(
't3'::regclass, t_data, t_infomask, t_infomask2, t_bits
) FROM heap_page_items(get_raw_page('t3', 0));

SELECT tdr.*, blarg IS NULL
FROM heap_page_items(get_raw_page('t3', 0))
LEFT JOIN LATERAL (
SELECT *
FROM tuple_data_record(
't3'::regclass, t_data, t_infomask, t_infomask2, t_bits
) AS n(i INTEGER, blarg TEXT)
) tdr ON true;

DROP TABLE t3;

and here's the output:

CREATE TABLE
BEGIN
ALTER TABLE
INSERT 0 1
ROLLBACK
ALTER TABLE
psql:test.sql:12: ERROR: unexpected end of tuple data
psql:test.sql:21: ERROR: unexpected end of tuple data
DROP TABLE

That error (unexpected end of tuple data) should (at least in the

non-TOAST

case)
prevent the bug of reading beyond the raw tuple data in memory, which

would

be
the easiest way I could imagine to cause a serious problem.

You don't need to read beyond the end of the data. You just need to do
something like reinterpret types, where the original type looks enough
like a toast header (e.g.) to cause problems.

Is there a case that could crash outside of a non-primitive type that has
unsafe data reading code?

Just about anything that's not a fixed length type would be unsafe.

Let's use the example of TEXT (that isn't TOASTed). If some bytes
are incorrectly interpreted as variable length text, then you have two
possibilities that I can see:

1. The determined length is less than the fixed type's space, in which
case we get bogus interpretation but it is not unsafe.
2. The determined length is greater than the fixed type's space. In this
case we get either bogus interpretation (of data beyond the fixed type's
space) or we get an error (like above).

Assuming bounds checking disallows reading beyond the raw tuple
data, then I still do not understand how a variable length field like
text (outside of TOAST at least) could be actually unsafe. It obviously
could give bizarre data.