From b396680f3b37285aa72bb5fccc8bbfeb47811306 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@otacoo.com>
Date: Wed, 2 Jul 2014 14:20:08 +0900
Subject: [PATCH 3/3] Buffer capture facility: check WAL replay consistency

It is a tool aimed to be used by developers and buildfarm machines that
can be used to check for consistency at page level when replaying WAL
files among several nodes of a cluster (generally master and standby
node).

This facility is made of two parts:
- A server part, where all the changes happening at page level are
captured and inserted in a file called buffer_captures located at the
root of PGDATA. Each buffer entry is masked to make the comparison
across node consistent (flags like hint bits for example) and then each
buffer is captured is with the following format as a single line of
the output file:
LSN: %08X/%08X page: PAGE_IN_HEXA
Hexadecimal format makes it easier to detect differences between pages,
and format is chosen to facilitate comparison between buffer entries.
- A client part, located in contrib/buffer_capture_cmp, that can be used
to compare buffer captures between nodes.

The footprint on core code is minimal and is controlled by a CFLAGS
called BUFFER_CAPTURE that needs to be set at build time to enable the
buffer capture at server level. If this symbol is not enabled, both
server and client parts are idle and generate nothing.

Note that this facility can generate a lot of output (11G when running
regression tests, counting double when using both master and standby).

contrib/buffer_capture_cmp contains a regression test facility easing
testing with buffer captures. The user just needs to run "make check"
in this folder... There is a default set of tests saved in test-default.sh
but user is free to set up custom tests by creating a file called
test-custom.sh that can be kicked by the test facility if this file
is present instead of the defaults.
---
 contrib/Makefile                                |   1 +
 contrib/buffer_capture_cmp/.gitignore           |   9 +
 contrib/buffer_capture_cmp/Makefile             |  35 ++
 contrib/buffer_capture_cmp/README               |  33 ++
 contrib/buffer_capture_cmp/buffer_capture_cmp.c | 329 ++++++++++++++++
 contrib/buffer_capture_cmp/test-default.sh      |  14 +
 contrib/buffer_capture_cmp/test.sh              | 161 ++++++++
 src/backend/access/heap/heapam.c                |  11 +
 src/backend/storage/buffer/Makefile             |   2 +-
 src/backend/storage/buffer/bufcapt.c            | 482 ++++++++++++++++++++++++
 src/backend/storage/buffer/bufmgr.c             |  40 +-
 src/backend/storage/lmgr/lwlock.c               |   8 +
 src/backend/storage/page/bufpage.c              |   3 +
 src/include/miscadmin.h                         |  13 +
 src/include/storage/bufcapt.h                   |  65 ++++
 15 files changed, 1204 insertions(+), 2 deletions(-)
 create mode 100644 contrib/buffer_capture_cmp/.gitignore
 create mode 100644 contrib/buffer_capture_cmp/Makefile
 create mode 100644 contrib/buffer_capture_cmp/README
 create mode 100644 contrib/buffer_capture_cmp/buffer_capture_cmp.c
 create mode 100644 contrib/buffer_capture_cmp/test-default.sh
 create mode 100644 contrib/buffer_capture_cmp/test.sh
 create mode 100644 src/backend/storage/buffer/bufcapt.c
 create mode 100644 src/include/storage/bufcapt.h

diff --git a/contrib/Makefile b/contrib/Makefile
index b37d0dd..1c8e6b9 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -10,6 +10,7 @@ SUBDIRS = \
 		auto_explain	\
 		btree_gin	\
 		btree_gist	\
+		buffer_capture_cmp \
 		chkpass		\
 		citext		\
 		cube		\
diff --git a/contrib/buffer_capture_cmp/.gitignore b/contrib/buffer_capture_cmp/.gitignore
new file mode 100644
index 0000000..ecd8b78
--- /dev/null
+++ b/contrib/buffer_capture_cmp/.gitignore
@@ -0,0 +1,9 @@
+# Binary generated
+/buffer_capture_cmp
+
+# Regression tests
+/capture_differences.txt
+/tmp_check
+
+# Custom test file
+/test-custom.sh
diff --git a/contrib/buffer_capture_cmp/Makefile b/contrib/buffer_capture_cmp/Makefile
new file mode 100644
index 0000000..83f2725
--- /dev/null
+++ b/contrib/buffer_capture_cmp/Makefile
@@ -0,0 +1,35 @@
+# contrib/buffer_capture_cmp/Makefile
+
+PGFILEDESC = "buffer_capture_cmp - Comparator tool between buffer captures"
+PGAPPICON = win32
+
+PROGRAM = buffer_capture_cmp
+OBJS	= buffer_capture_cmp.o
+
+PG_CPPFLAGS = -I$(libpq_srcdir) -DFRONTEND
+PG_LIBS = $(libpq_pgport)
+
+EXTRA_CLEAN = tmp_check/ capture_differences.txt
+
+# test.sh creates a cluster dedicated for the test
+EXTRA_REGRESS_OPTS=--use-existing
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/buffer_capture_cmp
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Tests can be done only when BUFFER_CAPTURE is defined
+ifneq (,$(filter -DBUFFER_CAPTURE,$(CFLAGS)))
+check: test.sh all
+	MAKE=$(MAKE) bindir=$(bindir) EXTRA_REGRESS_OPTS="$(EXTRA_REGRESS_OPTS)" $(SHELL) ./test.sh
+else
+check:
+	echo "BUFFER_CAPTURE is not defined in CFLAGS, so no tests to run"
+endif
diff --git a/contrib/buffer_capture_cmp/README b/contrib/buffer_capture_cmp/README
new file mode 100644
index 0000000..8a0a154
--- /dev/null
+++ b/contrib/buffer_capture_cmp/README
@@ -0,0 +1,33 @@
+buffer_capture_cmp
+------------------
+
+This facility already contains a set of regression tests that can be run
+by default with the following command:
+
+    make check
+
+The regression scripts contain a hook that can be used as an entry point
+to run some custom tests using this facility. Simply create in this folder
+a file called test-custom.sh and execute all the commands necessary for the
+tests. This custom script should use PGPORT as port to connect to the
+server where tests are run.
+
+Tips
+----
+
+The page images take up a lot of disk space! The PostgreSQL regression
+suite generates about 11GB - double that when the same is generated also
+in a standby.
+
+Always stop the master first, then standby. Otherwise, when you restart
+the standby, it will start WAL replay from the previous checkpoint, and
+log some page images already. Stopping the master creates a checkpoint
+record, avoiding the problem.
+
+It is possible to use pg_xlogdump to see which WAL record a page image
+corresponds to. But beware that the LSN in the page image points to the
+*end* of the WAL record, while the LSN that pg_xlogdump prints is the
+*beginning* of the WAL record. So to find which WAL record a page image
+corresponds to, find the LSN from the page image in pg_xlogdump output,
+and back off one record. (you can't just grep for the line containing
+the LSN).
diff --git a/contrib/buffer_capture_cmp/buffer_capture_cmp.c b/contrib/buffer_capture_cmp/buffer_capture_cmp.c
new file mode 100644
index 0000000..dcb1b3f
--- /dev/null
+++ b/contrib/buffer_capture_cmp/buffer_capture_cmp.c
@@ -0,0 +1,329 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffer_capture_cmp.c
+ *	  Utility tool to compare buffer captures between two nodes of
+ *	  a cluster. This utility needs to be run on servers whose code
+ *	  has been built with the symbol BUFFER_CAPTURE defined.
+ *
+ * Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    contrib/buffer_capture_cmp/buffer_capture_cmp.c
+ *
+ * Capture files that can be obtained on nodes of a cluster do not
+ * necessarily have the page images logged in the same order. For
+ * example, if a single WAL-logged operation modifies multiple pages,
+ * like an index page split, the standby might release the locks
+ * in different order than the master. Another cause is concurrent
+ * operations; writing the page images is not atomic with WAL insertion,
+ * so if two backends are running concurrently, their modifications in
+ * the image log can be interleaved in different order than their WAL
+ * records.
+ *
+ * To fix that, the lines from the capture file are read into a reorder
+ * buffer, and sorted there. Sorting the whole file would be overkill,
+ * as the lines are mostly in order already. The fixed-size reorder
+ * buffer works as long as the lines are not out-of-order by more than
+ * REORDER_BUFFER_SIZE lines.
+ */
+
+#include "postgres_fe.h"
+#include "port.h"
+
+/* Size of a single entry of the capture file */
+#define LINESZ (BLCKSZ*2 + 31)
+
+/* Line reordering stuff */
+#define REORDER_BUFFER_SIZE 1000
+
+typedef struct
+{
+	char	   *lines[REORDER_BUFFER_SIZE];
+	int 		nlines;		/* Number of lines currently in buffer */
+
+	FILE	   *fp;
+	bool		eof;		/* Has EOF been reached from this source? */
+} reorder_buffer;
+
+/*
+ * Read lines from the capture file into the reorder buffer, until the
+ * buffer is full.
+ */
+static void
+fill_reorder_buffer(reorder_buffer *buf)
+{
+	if (buf->eof)
+		return;
+
+	while (buf->nlines < REORDER_BUFFER_SIZE)
+	{
+		char *linebuf = pg_malloc(LINESZ);
+
+		if (fgets(linebuf, LINESZ, buf->fp) == NULL)
+		{
+			buf->eof = true;
+			pg_free(linebuf);
+			break;
+		}
+
+		/*
+		 * Common case, entry goes to the end. This happens for an
+		 * initialization or when buffer compares to be higher than
+		 * the last buffer in queue.
+		 */
+		if (buf->nlines == 0 ||
+			strcmp(linebuf, buf->lines[buf->nlines - 1]) >= 0)
+		{
+			buf->lines[buf->nlines] = linebuf;
+		}
+		else
+		{
+			/* Find the right place in the queue */
+			int			i;
+
+			/*
+			 * Scan all the existing buffers and find where buffer needs
+			 * to be included. We already know that the comparison result
+			 * we the last buffer in list.
+			 */
+			for (i = buf->nlines - 1; i >= 0; i--)
+			{
+				if (strcmp(linebuf, buf->lines[i]) >= 0)
+					break;
+			}
+
+			/* Place buffer correctly in the list */
+			i++;
+			memmove(&buf->lines[i + 1], &buf->lines[i],
+					(buf->nlines - i) * sizeof(char *));
+			buf->lines[i] = linebuf;
+		}
+		buf->nlines++;
+	}
+}
+
+/*
+ * Initialize a reorder buffer.
+ */
+static reorder_buffer *
+init_reorder_buffer(FILE *fp)
+{
+	reorder_buffer *buf;
+
+	buf = pg_malloc(sizeof(reorder_buffer));
+	buf->fp = fp;
+	buf->eof = false;
+	buf->nlines = 0;
+
+	fill_reorder_buffer(buf);
+
+	return buf;
+}
+
+/*
+ * Read all the lines that belong to the next WAL record from the reorder
+ * buffer.
+ */
+static int
+readrecord(reorder_buffer *buf, char **lines, uint64 *lsn)
+{
+	int			nlines;
+	uint32		line_xlogid;
+	uint32		line_xrecoff;
+	uint64		line_lsn;
+	uint64		rec_lsn;
+
+	/* Get all the lines with the same LSN */
+	for (nlines = 0; nlines < buf->nlines; nlines++)
+	{
+		/* Fetch LSN from the first 8 bytes of the buffer */
+		sscanf(buf->lines[nlines], "LSN: %08X/%08X",
+			   &line_xlogid, &line_xrecoff);
+		line_lsn = ((uint64) line_xlogid) << 32 | (uint64) line_xrecoff;
+
+		if (nlines == 0)
+			*lsn = rec_lsn = line_lsn;
+		else
+		{
+			if (line_lsn != rec_lsn)
+				break;
+		}
+	}
+
+	if (nlines == buf->nlines)
+	{
+		if (!buf->eof)
+		{
+			fprintf(stderr, "max number of lines in record reached, LSN: %X/%08X\n",
+					line_xlogid, line_xrecoff);
+			exit(1);
+		}
+	}
+
+	/* Consume the lines from the reorder buffer */
+	memcpy(lines, buf->lines, sizeof(char *) * nlines);
+	memmove(&buf->lines[0], &buf->lines[nlines],
+			sizeof(char *) * (buf->nlines - nlines));
+	buf->nlines -= nlines;
+
+	fill_reorder_buffer(buf);
+	return nlines;
+}
+
+/*
+ * Free all the given records.
+ */
+static void
+freerecord(char **lines, int nlines)
+{
+	int                     i;
+
+	for (i = 0; i < nlines; i++)
+		pg_free(lines[i]);
+}
+
+/*
+ * Print out given records.
+ */
+static void
+printrecord(char **lines, int nlines)
+{
+	int			i;
+
+	for (i = 0; i < nlines; i++)
+		printf("%s", lines[i]);
+}
+
+/*
+ * Do a direct comparison between the two given records that have the
+ * same LSN entry.
+ */
+static bool
+diffrecord(char **lines_a, int nlines_a, char **lines_b, int nlines_b)
+{
+	int i;
+
+	/* Leave if they do not have the same number of records */
+	if (nlines_a != nlines_b)
+		return true;
+
+	/*
+	 * Now do a comparison line by line. If any diffs are found at
+	 * character-level they will be printed out.
+	 */
+	for (i = 0; i < nlines_a; i++)
+	{
+		if (strcmp(lines_a[i], lines_b[i]) != 0)
+		{
+			int strlen_a = strlen(lines_a[i]);
+			int strlen_b = strlen(lines_b[i]);
+			int j;
+
+			printf("strlen_a: %d, strlen_b: %d\n",
+				   strlen_a, strlen_b);
+			for (j = 0; j < strlen_a; j++)
+			{
+				char char_a = lines_a[i][j];
+				char char_b = lines_b[i][j];
+				if (char_a != char_b)
+					printf("position: %d, char_a: %c, char_b: %c\n",
+						   j, char_a, char_b);
+			}
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static void
+usage(void)
+{
+	printf("usage: postprocess-images <master's capture file> <standby's capture file>\n");
+}
+
+int
+main(int argc, char **argv)
+{
+	char	   *lines_a[REORDER_BUFFER_SIZE];
+	int			nlines_a;
+	char	   *lines_b[REORDER_BUFFER_SIZE];
+	int			nlines_b;
+	char	   *path_a, *path_b;
+	FILE	   *fp_a;
+	FILE	   *fp_b;
+	uint64		lsn_a;
+	uint64		lsn_b;
+	reorder_buffer *buf_a;
+	reorder_buffer *buf_b;
+
+	if (argc != 3)
+	{
+		usage();
+		exit(1);
+	}
+
+	/* Open first file */
+	path_a = pg_strdup(argv[1]);
+	fp_a = fopen(path_a, "rb");
+	if (fp_a == NULL)
+	{
+		fprintf(stderr, "Could not open file \"%s\"\n", path_a);
+		fprintf(stderr, "Check if server binaries are built with symbol BUFFER_CAPTURE");
+		exit(2);
+	}
+
+	/* Open second file */
+	path_b = pg_strdup(argv[2]);
+	fp_b = fopen(path_b, "rb");
+	if (fp_b == NULL)
+	{
+		fprintf(stderr, "Could not open file \"%s\"\n", path_b);
+		fprintf(stderr, "Check if server binaries are built with symbol BUFFER_CAPTURE");
+		exit(2);
+	}
+
+	/* Initialize buffers for first loop */
+	buf_a = init_reorder_buffer(fp_a);
+	buf_b = init_reorder_buffer(fp_b);
+
+	/* Read first record from both */
+	nlines_a = readrecord(buf_a, lines_a, &lsn_a);
+	nlines_b = readrecord(buf_b, lines_b, &lsn_b);
+
+	/* Do comparisons as long as there are entries */
+	while (nlines_a > 0 || nlines_b > 0)
+	{
+		/* Compare the records */
+		if (lsn_a < lsn_b || nlines_b == 0)
+		{
+			printf("Only in A:\n");
+			printrecord(lines_a, nlines_a);
+			freerecord(lines_a, nlines_a);
+			nlines_a = readrecord(buf_a, lines_a, &lsn_a);
+		}
+		else if (lsn_a > lsn_b || nlines_a == 0)
+		{
+			printf("Only in B:\n");
+			printrecord(lines_b, nlines_b);
+			freerecord(lines_b, nlines_b);
+			nlines_b = readrecord(buf_b, lines_b, &lsn_b);
+		}
+		else if (lsn_a == lsn_b)
+		{
+			if (diffrecord(lines_a, nlines_a, lines_b, nlines_b))
+			{
+				printf("Lines differ, A:\n");
+				printrecord(lines_a, nlines_a);
+				printf("B:\n");
+				printrecord(lines_b, nlines_b);
+			}
+			freerecord(lines_a, nlines_a);
+			freerecord(lines_b, nlines_b);
+			nlines_a = readrecord(buf_a, lines_a, &lsn_a);
+			nlines_b = readrecord(buf_b, lines_b, &lsn_b);
+		}
+	}
+
+	return 0;
+}
diff --git a/contrib/buffer_capture_cmp/test-default.sh b/contrib/buffer_capture_cmp/test-default.sh
new file mode 100644
index 0000000..5bec503
--- /dev/null
+++ b/contrib/buffer_capture_cmp/test-default.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Default test suite for buffer_compare_cmp
+
+# PGPORT is already set so process should refer to that when kicking tests
+
+# In order to run the regression test suite, copy this file as test-custom.sh
+# and then uncomment the following lines:
+# echo ROOT_DIR=$PWD
+# psql -c 'CREATE DATABASE regression'
+# cd ../../src/test/regress && make installcheck 2>&1 > /dev/null
+# cd $ROOT_DIR
+
+# Create a simple table
+psql -c 'CREATE TABLE aa AS SELECT generate_series(1, 10) AS a'
diff --git a/contrib/buffer_capture_cmp/test.sh b/contrib/buffer_capture_cmp/test.sh
new file mode 100644
index 0000000..d44cd56
--- /dev/null
+++ b/contrib/buffer_capture_cmp/test.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+# contrib/buffer_capture_cmp/test.sh
+#
+# Test driver for buffer_capture_cmp. It does the following processing:
+#
+# 1) Initialization of a master and a standby
+# 2) Stop master, then standby
+# 3) Remove $PGDATA/buffer_capture on master and standby
+# 4) Start master, then standby
+# 5) Run custom or default series of tests
+# 6) Stop master, then standby
+# 7) Compare the buffer capture of both nodes
+# 8) The diffence file should be empty
+#
+# Portions Copyright (c) 2006-2014, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+
+# Initialize environment
+. ../../src/test/shell/init_env.sh
+
+# Adjust these paths for your environment
+TESTROOT=$PWD/tmp_check
+TEST_MASTER=$TESTROOT/data_master
+TEST_STANDBY=$TESTROOT/data_standby
+
+# Create the root folder for test data
+if [ ! -d $TESTROOT ]; then
+	mkdir -p $TESTROOT
+fi
+
+export PGDATABASE="postgres"
+
+# Set up PATH correctly
+PATH=$bindir:$PATH
+export PATH
+
+# Get port values for master node
+pg_get_test_port ../..
+PG_MASTER_PORT=$PGPORT
+
+# Enable echo so the user can see what is being executed
+set -x
+
+# Fetch file name containing buffer captures
+CAPTURE_FILE_NAME=`grep '#define BUFFER_CAPTURE_FILE' "$PG_ROOT_DIR"/src/include/storage/bufcapt.h | awk '{print $3}' | sed 's/\"//g'`
+CAPTURE_FILE_MASTER=$TEST_MASTER/$CAPTURE_FILE_NAME
+CAPTURE_FILE_STANDBY=$TEST_STANDBY/$CAPTURE_FILE_NAME
+
+# Set up the nodes, first the master
+rm -rf $TEST_MASTER
+initdb -N -A trust -D $TEST_MASTER
+
+# Custom parameters for master's postgresql.conf
+cat >> $TEST_MASTER/postgresql.conf <<EOF
+wal_level = hot_standby
+max_wal_senders = 2
+wal_keep_segments = 20
+checkpoint_segments = 50
+shared_buffers = 1MB
+log_line_prefix = 'M  %m %p '
+hot_standby = on
+autovacuum = off
+max_connections = 50
+listen_addresses = '$LISTEN_ADDRESSES'
+port = $PG_MASTER_PORT
+EOF
+
+# Accept replication connections on master
+cat >> $TEST_MASTER/pg_hba.conf <<EOF
+local replication all trust
+host replication all 127.0.0.1/32 trust
+host replication all ::1/128 trust
+EOF
+
+# Start master
+pg_ctl -w -D $TEST_MASTER start
+
+# Now the standby
+echo "Master initialized and running."
+
+# Set up standby with necessary parameters
+rm -rf $TEST_STANDBY
+
+# Base backup is taken with xlog files included
+pg_basebackup -D $TEST_STANDBY -p $PG_MASTER_PORT -x
+
+# Get a fresh port value for the standby
+pg_get_test_port ../..
+PG_STANDBY_PORT=$PGPORT
+echo standby: $PG_STANDBY_PORT master: $PG_MASTER_PORT
+echo "port = $PG_STANDBY_PORT" >> $TEST_STANDBY/postgresql.conf
+
+# Still need to set up PGPORT for subsequent tests
+PGPORT=$PG_MASTER_PORT
+export PGPORT
+
+cat > $TEST_STANDBY/recovery.conf <<EOF
+primary_conninfo='port=$PG_MASTER_PORT'
+standby_mode=on
+recovery_target_timeline='latest'
+EOF
+
+# Start standby
+pg_ctl -w -D $TEST_STANDBY start
+
+# Stop both nodes and remove the file containing the buffer captures
+# Master needs to be stopped first.
+pg_ctl -w -D $TEST_MASTER stop
+pg_ctl -w -D $TEST_STANDBY stop
+rm -rf $CAPTURE_FILE_MASTER $CAPTURE_FILE_STANDBY
+
+# Re-start the nodes
+pg_ctl -w -D $TEST_MASTER start
+pg_ctl -w -D $TEST_STANDBY start
+
+# Check the presence of custom tests and kick them in priority. If not,
+# fallback to the default tests. Tests need only to be run on the master
+# node.
+if [ -f ./test-custom.sh ]; then
+	. ./test-custom.sh
+else
+	. ./test-default.sh
+fi
+
+# Time to stop the nodes as tests have run
+pg_ctl -w -D $TEST_MASTER stop
+pg_ctl -w -D $TEST_STANDBY stop
+
+DIFF_FILE=capture_differences.txt
+
+# Now compare the buffer images
+# Disable erroring here, buffer capture file may not be present
+# if cluster has not been built with symbol BUFFER_CAPTURE
+set +e
+./buffer_capture_cmp $CAPTURE_FILE_MASTER $CAPTURE_FILE_STANDBY > $DIFF_FILE
+ERR_NUM=$?
+
+# Cover the case where capture file does not exist
+if [ $ERR_NUM == 2 ]; then
+	echo "Capture file does not exist"
+	echo "PASSED"
+	exit 0
+elif [ $ERR_NUM == 1 ]; then
+	echo "FAILED"
+	exit 1
+fi
+
+# No need to echo commands anymore
+set +x
+echo
+
+# Test passes if there are no diffs!
+if [ ! -s $DIFF_FILE ]; then
+	echo "PASSED"
+    exit 0
+else
+	echo "Diffs exist in the buffer captures"
+	echo "FAILED"
+	exit 1
+fi
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 6861ae0..d43406c 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -71,6 +71,9 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 
+#ifdef BUFFER_CAPTURE
+#include "storage/bufcapt.h"
+#endif
 
 /* GUC variable */
 bool		synchronize_seqscans = true;
@@ -7010,6 +7013,14 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
 
 	END_CRIT_SECTION();
 
+#ifdef BUFFER_CAPTURE
+	/*
+	 * The normal mechanism that hooks into LockBuffer doesn't work for this,
+	 * because we're bypassing buffer manager.
+	 */
+	buffer_capture_write(page, blkno);
+#endif
+
 	return recptr;
 }
 
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
index 2c10fba..6ec85b0 100644
--- a/src/backend/storage/buffer/Makefile
+++ b/src/backend/storage/buffer/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o
+OBJS = buf_table.o buf_init.o bufcapt.o bufmgr.o freelist.o localbuf.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/bufcapt.c b/src/backend/storage/buffer/bufcapt.c
new file mode 100644
index 0000000..9e5dfc1
--- /dev/null
+++ b/src/backend/storage/buffer/bufcapt.c
@@ -0,0 +1,482 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufcapt.c
+ *	  Routines for buffer capture, including masking and dynamic buffer
+ *	  snapshot.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/page/bufcapt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifdef BUFFER_CAPTURE
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/gist.h"
+#include "access/gin_private.h"
+#include "access/htup_details.h"
+#include "access/spgist_private.h"
+#include "commands/sequence.h"
+#include "storage/bufcapt.h"
+#include "storage/bufmgr.h"
+#include "storage/lwlock.h"
+
+/* Marker used to mask pages consistently */
+#define MASK_MARKER				0xFF
+
+/* Support for capturing changes to pages per process */
+#define MAX_BEFORE_IMAGES		100
+
+typedef struct
+{
+	Buffer		  buffer;
+	char			content[BLCKSZ];
+} BufferImage;
+
+static BufferImage *before_images[MAX_BEFORE_IMAGES];
+static FILE *imagefp;
+static int before_images_cnt = 0;
+
+/* ----------------------------------------------------------------
+ * Masking functions.
+ *
+ * Most pages cannot be compared directly, because some parts of the
+ * page are not expected to be byte-by-byte identical. For example,
+ * hint bits or unused space in the page. The strategy is to normalize
+ * all pages by creating a mask of those bits that are not expected to
+ * match.
+ */
+static void mask_unused_space(char *page);
+static void mask_heap_page(char *page);
+static void mask_spgist_page(char *page);
+static void mask_gist_page(char *page);
+static void mask_gin_page(BlockNumber blkno, char *page);
+static void mask_sequence_page(char *page);
+static void mask_btree_page(char *page);
+
+/*
+ * Mask the unused space of a page between pd_lower and pd_upper.
+ */
+static void
+mask_unused_space(char *page)
+{
+	int		 pd_lower = ((PageHeader) page)->pd_lower;
+	int		 pd_upper = ((PageHeader) page)->pd_upper;
+	int		 pd_special = ((PageHeader) page)->pd_special;
+
+	/* Sanity check */
+	if (pd_lower > pd_upper || pd_special < pd_upper ||
+		pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
+	{
+		elog(ERROR, "invalid page at %X/%08X\n",
+				((PageHeader) page)->pd_lsn.xlogid,
+				((PageHeader) page)->pd_lsn.xrecoff);
+	}
+
+	memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower);
+}
+
+/*
+ * Mask a heap page
+ */
+static void
+mask_heap_page(char *page)
+{
+	OffsetNumber off;
+	PageHeader phdr = (PageHeader) page;
+
+	mask_unused_space(page);
+
+	/* Ignore prune_xid (it's like a hint-bit) */
+	phdr->pd_prune_xid = 0xFFFFFFFF;
+
+	/* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints */
+	phdr->pd_flags |= PD_PAGE_FULL | PD_HAS_FREE_LINES;
+
+	/*
+	 * Also mask the all-visible flag.
+	 *
+	 * XXX: It is unfortunate that we have to do this. If the flag is set
+	 * incorrectly, that's serious, and we would like to catch it. If the flag
+	 * is cleared incorrectly, that's serious too. But redo of HEAP_CLEAN
+	 * records don't currently set the flag, even though it is set in the
+	 * master, so we must silence failures that that causes.
+	 */
+	phdr->pd_flags |= PD_ALL_VISIBLE;
+
+	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
+	{
+		ItemId	  iid = PageGetItemId(page, off);
+		char	   *page_item;
+
+		page_item = (char *) (page + ItemIdGetOffset(iid));
+
+		/*
+		 * Ignore hint bits and command ID.
+		 */
+		if (ItemIdIsNormal(iid))
+		{
+			HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
+
+			page_htup->t_infomask =
+				HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID |
+				HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID;
+			page_htup->t_infomask |= HEAP_COMBOCID;
+			page_htup->t_choice.t_heap.t_field3.t_cid = 0xFFFFFFFF;
+		}
+
+		/*
+		 * Ignore any padding bytes after the tuple, when the length of
+		 * the item is not MAXALIGNed.
+		 */
+		if (ItemIdHasStorage(iid))
+		{
+			int	 len = ItemIdGetLength(iid);
+			int	 padlen = MAXALIGN(len) - len;
+
+			if (padlen > 0)
+				memset(page_item + len, MASK_MARKER, padlen);
+		}
+	}
+}
+
+/*
+ * Mask a SpGist page
+ */
+static void
+mask_spgist_page(char *page)
+{
+	mask_unused_space(page);
+}
+
+/*
+ * Mask a GIST page
+ */
+static void
+mask_gist_page(char *page)
+{
+	mask_unused_space(page);
+}
+
+/*
+ * Mask a Gin page
+ */
+static void
+mask_gin_page(BlockNumber blkno, char *page)
+{
+	/* GIN metapage doesn't use pd_lower/pd_upper. Other page types do. */
+	if (blkno != 0)
+		mask_unused_space(page);
+}
+
+/*
+ * Mask a sequence page
+ */
+static void
+mask_sequence_page(char *page)
+{
+	/*
+	 * FIXME: currently, we just ignore sequence records altogether. nextval
+	 * records a different value in the WAL record than it writes to the
+	 * buffer. Ideally we would only mask out the value in the tuple.
+	 */
+	memset(page, MASK_MARKER, BLCKSZ);
+}
+
+/*
+ * Mask a btree page
+ */
+static void
+mask_btree_page(char *page)
+{
+	OffsetNumber off;
+	OffsetNumber maxoff;
+	BTPageOpaque maskopaq = (BTPageOpaque)
+		(((char *) page) + ((PageHeader) page)->pd_special);
+
+	/*
+	 * Mark unused space before any processing. This is important as it
+	 * uses pd_lower and pd_upper that may be masked on this page
+	 * afterwards if it is a deleted page.
+	 */
+	mask_unused_space(page);
+
+	/*
+	 * Mask everything on a DELETED page.
+	 */
+	if (((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags & BTP_DELETED)
+	{
+		/* Page content, between standard page header and opaque struct */
+		memset(page + SizeOfPageHeaderData, MASK_MARKER,
+			   BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData)));
+
+		/* pd_lower and upper */
+		memset(&((PageHeader) page)->pd_lower, MASK_MARKER, sizeof(uint16));
+		memset(&((PageHeader) page)->pd_upper, MASK_MARKER, sizeof(uint16));
+	}
+	else
+	{
+		/*
+		 * Mask some line pointer bits, particularly those marked as
+		 * used on a master and unused on a standby.
+		 * XXX: This could be better with more refinement.
+		 */
+		maxoff = PageGetMaxOffsetNumber(page);
+		for (off = 1; off <= maxoff; off++)
+		{
+			ItemId iid = PageGetItemId(page, off);
+
+			if (ItemIdIsUsed(iid))
+				iid->lp_flags = LP_UNUSED;
+		}
+	}
+
+	/*
+	 * Mask BTP_HAS_GARBAGE flag. This needs to be done at the end
+	 * of process as previous masking operations could generate some
+	 * garbage.
+	 */
+	maskopaq->btpo_flags |= BTP_HAS_GARBAGE;
+}
+
+/* ----------------------------------------------------------------
+ * Buffer capture functions.
+ *
+ * Those functions can be used to memmorize the content of pages
+ * and flush them to BUFFER_CAPTURE_FILE when necessary.
+ */
+static bool
+buffer_capture_is_changed(BufferImage *img)
+{
+	Page			newcontent = BufferGetPage(img->buffer);
+	Page			oldcontent = (Page) img->content;
+
+	if (PageGetLSN(oldcontent) == PageGetLSN(newcontent))
+		return false;
+	return true;
+}
+
+/*
+ * Initialize page capture
+ */
+void
+buffer_capture_init(void)
+{
+	int				i;
+	BufferImage	   *images;
+
+	/* Initialize page image capturing */
+	images = palloc(MAX_BEFORE_IMAGES * sizeof(BufferImage));
+
+	for (i = 0; i < MAX_BEFORE_IMAGES; i++)
+		before_images[i] = &images[i];
+
+	imagefp = fopen(BUFFER_CAPTURE_FILE, "ab");
+}
+
+/*
+ * buffer_capture_reset
+ *
+ * Reset buffer captures
+ */
+void
+buffer_capture_reset(void)
+{
+	if (before_images_cnt > 0)
+		elog(LOG, "Released all buffer captures");
+	before_images_cnt = 0;
+}
+
+/*
+ * buffer_capture_write
+ *
+ * Flush to file the new content page present here after applying a
+ * mask on it.
+ */
+void
+buffer_capture_write(char *newcontent,
+					 uint32 blkno)
+{
+	XLogRecPtr	newlsn = PageGetLSN((Page) newcontent);
+	char		page[BLCKSZ];
+	uint16		tail;
+
+	/*
+	 * We need a lock to make sure that only one backend writes to the file
+	 * at a time. Abuse SyncScanLock for that - it happens to never be used
+	 * while a buffer is locked/unlocked.
+	 */
+	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
+
+	/* Copy content of page before any operation */
+	memcpy(page, newcontent, BLCKSZ);
+
+	/*
+	 * Look at the size of the special area, and the last two bytes in
+	 * it, to detect what kind of a page it is. Then call the appropriate
+	 * masking function.
+	 */
+	memcpy(&tail, &page[BLCKSZ - 2], 2);
+	if (PageGetSpecialSize(page) == 0)
+	{
+		mask_heap_page(page);
+	}
+	else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(BTPageOpaqueData)) ||
+			 PageGetSpecialSize(page) == MAXALIGN(sizeof(GISTPageOpaqueData)))
+	{
+		/*
+		 * It happens that btree and gist have the same size of special
+		 * area.
+		 */
+		if (tail == GIST_PAGE_ID)
+			mask_gist_page(page);
+		else if (tail <= MAX_BT_CYCLE_ID)
+			mask_btree_page(page);
+	}
+	else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(SpGistPageOpaqueData)) ||
+			 PageGetSpecialSize(page) == MAXALIGN(sizeof(GinPageOpaqueData)) ||
+			 PageGetSpecialSize(page) == 8)
+	{
+		/*
+		 * XXX: Page detection for sequences can be improved.
+		 */
+		if (tail == SPGIST_PAGE_ID)
+			mask_spgist_page(page);
+		else if (*((uint32 *) (page + BLCKSZ - MAXALIGN(sizeof(uint32)))) == SEQ_MAGIC)
+			mask_sequence_page(page);
+		else
+			mask_gin_page(blkno, page);
+	}
+
+	/*
+	 * First write the LSN in a constant format to facilitate comparisons
+	 * between buffer captures.
+	 */
+	fprintf(imagefp, "LSN: %08X/%08X ",
+			(uint32) (newlsn >> 32), (uint32) newlsn);
+
+	/* Then write the page contents, in hex */
+	fprintf(imagefp, "page: ");
+	{
+		char	buf[BLCKSZ * 2];
+		int     j = 0;
+		int		i;
+		for (i = 0; i < BLCKSZ; i++)
+		{
+			const char *digits = "0123456789ABCDEF";
+			uint8 byte = (uint8) page[i];
+
+			buf[j++] = digits[byte >> 4];
+			buf[j++] = digits[byte & 0x0F];
+		}
+		fwrite(buf, BLCKSZ * 2, 1, imagefp);
+	}
+	fprintf(imagefp, "\n");
+
+	/* Then the masked page in hex format */
+	fflush(imagefp);
+
+	/* Clean up */
+	LWLockRelease(SyncScanLock);
+}
+
+/*
+ * buffer_capture_remember
+ *
+ * Append a page content to the existing list of buffers to-be-captured.
+ */
+void
+buffer_capture_remember(Buffer buffer)
+{
+	BufferImage *img;
+
+	Assert(before_images_cnt < MAX_BEFORE_IMAGES);
+
+	img = before_images[before_images_cnt];
+	img->buffer = buffer;
+	memcpy(img->content, BufferGetPage(buffer), BLCKSZ);
+	before_images_cnt++;
+}
+
+/*
+ * buffer_capture_forget
+ *
+ * Forget a page image. If the page was modified, log the new contents.
+ */
+void
+buffer_capture_forget(Buffer buffer)
+{
+	int	i;
+
+	for (i = 0; i < before_images_cnt; i++)
+	{
+		BufferImage *img = before_images[i];
+
+		if (img->buffer == buffer)
+		{
+			/* If page has new content, capture it */
+			if (buffer_capture_is_changed(img))
+			{
+				Page content = BufferGetPage(img->buffer);
+				RelFileNode	rnode;
+				ForkNumber	forknum;
+				BlockNumber	blkno;
+
+				BufferGetTag(img->buffer, &rnode, &forknum, &blkno);
+				buffer_capture_write(content, blkno);
+			}
+
+			if (i != before_images_cnt)
+			{
+				/* Swap the last still-used slot with this one */
+				before_images[i] = before_images[before_images_cnt - 1];
+				before_images[before_images_cnt - 1] = img;
+			}
+
+			before_images_cnt--;
+			return;
+		}
+	}
+	elog(LOG, "could not find image for buffer %u", buffer);
+}
+
+/*
+ * buffer_capture_scan
+ *
+ * See if any of the buffers that have been memorized have changed.
+ */
+void
+buffer_capture_scan(void)
+{
+	int i;
+
+	for (i = 0; i < before_images_cnt; i++)
+	{
+		BufferImage *img = before_images[i];
+
+		/*
+		 * Print the contents of the page if it was changed. Remember the
+		 * new contents as the current image.
+		 */
+		if (buffer_capture_is_changed(img))
+		{
+			Page content = BufferGetPage(img->buffer);
+			RelFileNode	rnode;
+			ForkNumber	forknum;
+			BlockNumber	blkno;
+
+			BufferGetTag(img->buffer, &rnode, &forknum, &blkno);
+			buffer_capture_write(content, blkno);
+			memcpy(img->content, BufferGetPage(img->buffer), BLCKSZ);
+		}
+	}
+}
+
+#endif /* BUFFER_CAPTURE */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 07ea665..b1dd4ce 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -50,6 +50,9 @@
 #include "utils/resowner_private.h"
 #include "utils/timestamp.h"
 
+#ifdef BUFFER_CAPTURE
+#include "storage/bufcapt.h"
+#endif
 
 /* Note: these two macros only work on shared buffers, not local ones! */
 #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
@@ -1708,6 +1711,10 @@ AtEOXact_Buffers(bool isCommit)
 {
 	CheckForBufferLeaks();
 
+#ifdef BUFFER_CAPTURE
+	buffer_capture_reset();
+#endif
+
 	AtEOXact_LocalBuffers(isCommit);
 }
 
@@ -1724,6 +1731,10 @@ void
 InitBufferPoolBackend(void)
 {
 	on_shmem_exit(AtProcExit_Buffers, 0);
+
+#ifdef BUFFER_CAPTURE
+	buffer_capture_init();
+#endif
 }
 
 /*
@@ -2749,6 +2760,20 @@ LockBuffer(Buffer buffer, int mode)
 
 	buf = &(BufferDescriptors[buffer - 1]);
 
+#ifdef BUFFER_CAPTURE
+	if (mode == BUFFER_LOCK_UNLOCK)
+	{
+		/*
+		 * XXX: peek into the LWLock struct to see if we're holding it in
+		 * exclusive or shared mode. This is concurrency-safe: if we're holding
+		 * it in exclusive mode, no-one else can release it. If we're holding
+		 * it in shared mode, no-one else can acquire it in exclusive mode.
+		 */
+		if (buf->content_lock->exclusive > 0)
+			buffer_capture_forget(buffer);
+	}
+#endif
+
 	if (mode == BUFFER_LOCK_UNLOCK)
 		LWLockRelease(buf->content_lock);
 	else if (mode == BUFFER_LOCK_SHARE)
@@ -2757,6 +2782,11 @@ LockBuffer(Buffer buffer, int mode)
 		LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
+
+#ifdef BUFFER_CAPTURE
+	if (mode == BUFFER_LOCK_EXCLUSIVE)
+		buffer_capture_remember(buffer);
+#endif
 }
 
 /*
@@ -2768,6 +2798,7 @@ bool
 ConditionalLockBuffer(Buffer buffer)
 {
 	volatile BufferDesc *buf;
+	bool	res;
 
 	Assert(BufferIsValid(buffer));
 	if (BufferIsLocal(buffer))
@@ -2775,7 +2806,14 @@ ConditionalLockBuffer(Buffer buffer)
 
 	buf = &(BufferDescriptors[buffer - 1]);
 
-	return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+	res = LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+
+#ifdef BUFFER_CAPTURE
+	if (res)
+		buffer_capture_remember(buffer);
+#endif
+
+	return res;
 }
 
 /*
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index a62af27..4671a55 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -41,6 +41,10 @@
 #include "storage/spin.h"
 #include "utils/memutils.h"
 
+#ifdef BUFFER_CAPTURE
+#include "storage/bufcapt.h"
+#endif
+
 #ifdef LWLOCK_STATS
 #include "utils/hsearch.h"
 #endif
@@ -1272,6 +1276,10 @@ LWLockRelease(LWLock *l)
 void
 LWLockReleaseAll(void)
 {
+#ifdef BUFFER_CAPTURE
+	buffer_capture_reset();
+#endif
+
 	while (num_held_lwlocks > 0)
 	{
 		HOLD_INTERRUPTS();		/* match the upcoming RESUME_INTERRUPTS */
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 6351a9b..6beaa15 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -21,6 +21,9 @@
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
+#ifdef BUFFER_CAPTURE
+#include "storage/bufcapt.h"
+#endif
 
 /* GUC variable */
 bool		ignore_checksum_failure = false;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index c2b786e..1ae98f7 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -116,11 +116,24 @@ do { \
 
 #define START_CRIT_SECTION()  (CritSectionCount++)
 
+#ifdef BUFFER_CAPTURE
+/* in src/backend/storage/buffer/bufcapt.c */
+void buffer_capture_scan(void);
+
+#define END_CRIT_SECTION() \
+do { \
+	Assert(CritSectionCount > 0); \
+	CritSectionCount--; \
+	if (CritSectionCount == 0) \
+		buffer_capture_scan(); \
+} while(0)
+#else
 #define END_CRIT_SECTION() \
 do { \
 	Assert(CritSectionCount > 0); \
 	CritSectionCount--; \
 } while(0)
+#endif
 
 
 /*****************************************************************************
diff --git a/src/include/storage/bufcapt.h b/src/include/storage/bufcapt.h
new file mode 100644
index 0000000..089e5a7
--- /dev/null
+++ b/src/include/storage/bufcapt.h
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufcaptr.h
+ *	  Buffer capture definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/bufcapt.h
+ *
+ * About BUFFER_CAPTURE:
+ *
+ * If this symbol is defined, all page images that are logged on this
+ * server are as well flushed to BUFFER_CAPTURE_FILE. One line of the
+ * output file is used for a single page image.
+ *
+ * The page images obtained are aimed to be used with the utility tool
+ * called buffer_capture_cmp available in contrib/ and can be used to
+ * compare how WAL is replayed between master and standby nodes, helping
+ * in spotting bugs in this area. As each page is written in hexadecimal
+ * format, one single page writes BLCKSZ * 2 bytes to the capture file.
+ * Hexadecimal format makes it easier to spot differences between captures
+ * done among nodes. Be aware that this has a large impact on I/O and that
+ * it is aimed only for buildfarm and development purposes.
+ *
+ * One single page entry has the following format:
+ *	LSN: %08X/08X page: PAGE_IN_HEXA
+ *
+ * The LSN corresponds to the log sequence number to which the page image
+ * applies to, then the content of the image is added as-is. This format
+ * is chosen to facilitate comparisons between each capture entry
+ * particularly in cases where LSN increases its digit number.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFCAPT_H
+#define BUFCAPT_H
+
+#ifdef BUFFER_CAPTURE
+
+#include "postgres.h"
+
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/relfilenode.h"
+
+/*
+ * Output file where buffer captures are stored
+ */
+#define BUFFER_CAPTURE_FILE "buffer_captures"
+
+void buffer_capture_init(void);
+void buffer_capture_reset(void);
+void buffer_capture_write(char *newcontent,
+						  uint32 blkno);
+
+void buffer_capture_remember(Buffer buffer);
+void buffer_capture_forget(Buffer buffer);
+void buffer_capture_scan(void);
+
+#endif /* BUFFER_CAPTURE */
+
+#endif
-- 
2.0.0

