Proposal: generic WAL compression
Hackers,
a few years ago generic WAL was proposed by Alexander Korotkov
(/messages/by-id/CAPpHfdsXwZmojm6Dx+TJnpYk27kT4o7Ri6X_4OSWcByu1Rm+VA@mail.gmail.com
and was committed into PostgreSQL 9.6.
One of the generic WAL advantages is the common interface for safe
interaction with WAL for modules and extensions. The interface allows
module to register the page, then change it, and then generic WAL
generates and writes into xlog the algorithm of changing the old version
of page into the new one. In the case of crash and recovery this
algorithm may be applied.
Bloom and RUM indexes use generic WAL. The issue is that the generic
algorithm of turning the old page into the new one is not optimal in the
sense of record size in xlog. Now the main idea of the approach is to
find all positions in the page where new version is different from the
original one and write these changes into xlog. It works well if the
page was rewritten completely or if only a few bytes have been changed.
Nevertheless, this approach produces too large WAL record in the case of
inserting or deleting a few bytes into the page. In this case there are
almost no position where the old version and the new one are equal, so
the record size is near the page size, but actual amount of changes in
the page is small. This is exactly what happens often in RUM indexes.
In order to overcome that issue, I would like to propose the patch,
which provides possibility to use another approach of the WAL record
construction. If another approach fails to make a short enough record,
it rolls back to the original approach. The main idea of another
approach is not to perform bytewise comparison of pages, but finding the
minimal editing distance between two pages and the corresponding editing
algorithm. In the general case, finding editing distance takes O(N^2)
time and memory. But there is also an algorithm which requires only
O(ND) time and O(D^2) memory, where D is the editing distance between
two sequences. Also for given D algorithm may show that the minimal
editing distance between two sequences is more than D in the same amount
of time and memory.
The special case of this algorithm which does not consider replace
operations is described in the paper
(http://www.xmailserver.org/diff2.pdf). The version of this algorithm
which consumes O(ND) time and O(N) memory is used in diff console
command, but for our purposes we don't need to increase the constant for
the time in order to decrease memory complexity. For RUM indexes we
usually have small enough editing distance (less than 64), so D^2 is not
too much to store.
The results of experiments:
+------------------------------+----------------+----------------+----------------+----------------+
| Name | WAL_diff(MB) | WAL_orig(MB) |
Time_diff(s) | Time_orig(s) |
+------------------------------+----------------+----------------+----------------+----------------+
| rum: make installcheck | 38.9 | 82.5 | 4.37
| 4.16 |
+------------------------------+----------------+----------------+----------------+----------------+
| bloom: make installcheck | 1.0 | 1.0 | 0.66 |
0.53 |
+------------------------------+----------------+----------------+----------------+----------------+
| test00.sql | 20.5 | 51.0 | 1.86 |
1.41 |
+------------------------------+----------------+----------------+----------------+----------------+
| test01.sql | 207.1 | 219.7 | 8.06 | 6.89
|
+------------------------------+----------------+----------------+----------------+----------------+
We can see that the patch provides a little slowdown, but compresses
generic WAL efficiently for RUM index. Also I'm going to do a few more
experiments on this patch with another data.
The patch was tested on Lubuntu 14.04, but should not contain any
platform-specific items. The patch, the files and scripts for doing the
experiments and performance tests are attached.
Oleg Ivanov
Postgres Professional
The Russian PostgreSQL Company
Attachments:
generic_xlog_diffdelta_v1.patchtext/x-patch; name=generic_xlog_diffdelta_v1.patchDownload
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index 3adbf7b..cd0ed2a 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -43,9 +43,18 @@
* a full page's worth of data.
*-------------------------------------------------------------------------
*/
-#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber))
+#define FRAGMENT_HEADER_SIZE (2 * (sizeof(OffsetNumber) + \
+ sizeof(char) + sizeof(int)))
#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE
-#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE)
+#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE) + sizeof(bool)
+
+#define MAX_ALIGN_MISMATCHES 64
+/* MAX_ALIGN_MISMATCHES is supposed to be not greater than PG_UINT8_MAX */
+#define MIN_DELTA_DIFFERECE 12
+#define ALIGN_GAP 256
+
+#define writeToPtr(ptr, x) memcpy(ptr, &(x), sizeof(x)), ptr += sizeof(x)
+#define readFromPtr(ptr, x) memcpy(&(x), ptr, sizeof(x)), ptr += sizeof(x)
/* Struct of generic xlog data for single page */
typedef struct
@@ -71,6 +80,22 @@ struct GenericXLogState
bool isLogged;
};
+/* Describes for the region which type of delta is used in it */
+typedef enum
+{
+ DIFF_DELTA, /* diff delta with insert, remove and replace
+ * operations */
+ BASE_DELTA, /* base delta with update operations only */
+} DeltaType;
+
+/* Diff delta operations for transforming current page to target page */
+typedef enum
+{
+ DIFF_INSERT,
+ DIFF_REMOVE,
+ DIFF_REPLACE,
+} DiffDeltaOperations;
+
static void writeFragment(PageData *pageData, OffsetNumber offset,
OffsetNumber len, const char *data);
static void computeRegionDelta(PageData *pageData,
@@ -80,6 +105,32 @@ static void computeRegionDelta(PageData *pageData,
static void computeDelta(PageData *pageData, Page curpage, Page targetpage);
static void applyPageRedo(Page page, const char *delta, Size deltaSize);
+static int alignRegions(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen);
+
+static bool computeRegionDiffDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd);
+static const char *applyPageDiffRedo(Page page, const char *delta, Size deltaSize);
+
+static void computeRegionBaseDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd);
+static const char *applyPageBaseRedo(Page page, const char *delta, Size deltaSize);
+
+static bool containsDiffDelta(PageData *pageData);
+static void downgradeDeltaToBaseFormat(PageData *pageData);
+
+/* Arrays for the alignment building and for the resulting alignments */
+static int V[MAX_ALIGN_MISMATCHES + 1][2 * MAX_ALIGN_MISMATCHES + 1];
+static int16 curRegionAligned[BLCKSZ + MAX_ALIGN_MISMATCHES];
+static int16 targetRegionAligned[BLCKSZ + MAX_ALIGN_MISMATCHES];
+
+/* Array for diff delta application */
+static char localPage[BLCKSZ];
+
/*
* Write next fragment into pageData's delta.
@@ -95,13 +146,12 @@ writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
/* Verify we have enough space */
Assert(pageData->deltaLen + sizeof(offset) +
- sizeof(length) + length <= sizeof(pageData->delta));
+ sizeof(length) + length <= MAX_DELTA_SIZE);
/* Write fragment data */
- memcpy(ptr, &offset, sizeof(offset));
- ptr += sizeof(offset);
- memcpy(ptr, &length, sizeof(length));
- ptr += sizeof(length);
+ writeToPtr(ptr, offset);
+ writeToPtr(ptr, length);
+
memcpy(ptr, data, length);
ptr += length;
@@ -111,12 +161,13 @@ writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
/*
* Compute the XLOG fragments needed to transform a region of curpage into the
* corresponding region of targetpage, and append them to pageData's delta
- * field. The region to transform runs from targetStart to targetEnd-1.
+ * field. The region to transform runs from targetStart to targetEnd-1.
* Bytes in curpage outside the range validStart to validEnd-1 should be
* considered invalid, and always overwritten with target data.
*
- * This function is a hot spot, so it's worth being as tense as possible
- * about the data-matching loops.
+ * This function tries to build diff delta first and, if it fails, uses
+ * the base delta. It also provides the header before the delta in which
+ * the type and the length of the delta are contained.
*/
static void
computeRegionDelta(PageData *pageData,
@@ -124,6 +175,60 @@ computeRegionDelta(PageData *pageData,
int targetStart, int targetEnd,
int validStart, int validEnd)
{
+ int length;
+ char header;
+ bool diff;
+ int prevDeltaLen;
+ char *ptr = pageData->delta + pageData->deltaLen;
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(header) +
+ sizeof(length) <= MAX_DELTA_SIZE);
+
+ pageData->deltaLen += sizeof(header) + sizeof(length);
+ prevDeltaLen = pageData->deltaLen;
+ diff = computeRegionDiffDelta(pageData,
+ curpage, targetpage,
+ targetStart, targetEnd,
+ validStart, validEnd);
+ /*
+ * If we succeeded to make diff delta, just set the header.
+ * Otherwise, make base delta.
+ */
+ if (diff)
+ {
+ header = DIFF_DELTA;
+ }
+ else
+ {
+ header = BASE_DELTA;
+ computeRegionBaseDelta(pageData,
+ curpage, targetpage,
+ targetStart, targetEnd,
+ validStart, validEnd);
+ }
+ length = pageData->deltaLen - prevDeltaLen;
+
+ writeToPtr(ptr, header);
+ writeToPtr(ptr, length);
+}
+
+/*
+ * Compute the XLOG fragments needed to transform a region of curpage into the
+ * corresponding region of targetpage, and append them to pageData's delta
+ * field. The region to transform runs from targetStart to targetEnd-1.
+ * Bytes in curpage outside the range validStart to validEnd-1 should be
+ * considered invalid, and always overwritten with target data.
+ *
+ * This function is a hot spot, so it's worth being as tense as possible
+ * about the data-matching loops.
+ */
+static void
+computeRegionBaseDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd)
+{
int i,
loopEnd,
fragmentBegin = -1,
@@ -222,6 +327,389 @@ computeRegionDelta(PageData *pageData,
}
/*
+ * Align curRegion and targetRegion and return the length of the alignment
+ * or -1 if the alignment with number of mismathing positions less than
+ * or equal to MAX_ALIGN_MISMATCHES does not exist.
+ * The alignment is stored in curRegionAligned and targetRegionAligned.
+ * The algorithm guarantees to find the alignment with the least possible
+ * number of mismathing positions or return that such least number is greater
+ * than MAX_ALIGN_MISMATCHES.
+ *
+ * The implemented algorithm is a modification of that was described in
+ * the paper "An O(ND) Difference Algorithm and Its Variations". We chose
+ * the algorithm which requires O(N + D^2) memory with time complexity O(ND),
+ * because it is faster than another one which requires O(N + D) memory and
+ * O(ND) time. The only modification we made is the introduction of REPLACE
+ * operations, while in the original algorithm only INSERT and REMOVE
+ * are considered.
+ */
+static int
+alignRegions(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen)
+{
+ /* Number of mismatches */
+ int m;
+
+ /* Difference between curRegion and targetRegion prefix lengths */
+ int k;
+
+ /* Curbuf and targetRegion prefix lengths */
+ int i,
+ j;
+ int resLen;
+ int numMismatches = -1;
+
+ /*
+ * V is an array to store the values of dynamic programming. The first
+ * dimension corresponds to m, and the second one is for k + m.
+ */
+ V[0][0] = 0;
+
+ /* Find the longest path with the given number of mismatches */
+ for (m = 0; m <= MAX_ALIGN_MISMATCHES; ++m)
+ {
+ /*
+ * Find the largest prefix alignment with the given number of
+ * mismatches and the given difference between curRegion and
+ * targetRegion prefix lengths.
+ */
+ for (k = -m; k <= m; ++k)
+ {
+ /* Dynamic programming recurrent step */
+ if (m > 0)
+ {
+ if (k == 0 && m == 1)
+ i = 1;
+ else if (k == -m || (k != m &&
+ V[m - 1][m - 1 + k - 1] < V[m - 1][m - 1 + k + 1]))
+ i = V[m - 1][m - 1 + k + 1];
+ else
+ i = V[m - 1][m - 1 + k - 1] + 1;
+ if (k != -m && k != m && V[m - 1][m - 1 + k] + 1 > i)
+ i = V[m - 1][m - 1 + k] + 1;
+ }
+ else
+ i = 0;
+ j = i - k;
+
+ /* Boundary checks */
+ Assert(i >= 0);
+ Assert(j >= 0);
+
+ /* Increase the prefixes while the bytes are equal */
+ while (i < curRegionLen && j < targetRegionLen &&
+ curRegion[i] == targetRegion[j])
+ i++, j++;
+
+ /*
+ * Save the largest curRegion prefix that was aligned with given
+ * number of mismatches and difference between curRegion and
+ * targetRegion prefix lengths.
+ */
+ V[m][m + k] = i;
+
+ /* If we find the alignment, save its length and break */
+ if (i == curRegionLen && j == targetRegionLen)
+ {
+ numMismatches = m;
+ break;
+ }
+ }
+ /* Break if we find an alignment */
+ if (numMismatches != -1)
+ break;
+ }
+ /* No alignment was found */
+ if (numMismatches == -1)
+ return -1;
+
+ /* Restore the reversed alignment */
+ resLen = 0;
+ while (i != 0 || j != 0)
+ {
+ /* Check cycle invariants */
+ Assert(i >= 0 && j >= 0);
+ Assert(m >= 0);
+ Assert(-m <= k && k <= m);
+
+ /* Rollback the equal bytes */
+ while (i != 0 && j != 0 && curRegion[i - 1] == targetRegion[j - 1])
+ {
+ curRegionAligned[resLen] = curRegion[--i];
+ targetRegionAligned[resLen] = targetRegion[--j];
+ resLen++;
+ }
+ Assert(i >= 0 && j >= 0);
+
+ /* Break if we reach the start point */
+ if (i == 0 && j == 0)
+ break;
+
+ /* Do the backward dynamic programming step */
+ if ((k == 0 && m == 1) ||
+ (k != -m && k != m && V[m - 1][m - 1 + k] + 1 == i))
+ {
+ curRegionAligned[resLen] = curRegion[--i];
+ targetRegionAligned[resLen] = targetRegion[--j];
+ resLen++;
+ m -= 1;
+ }
+ else if (k == -m || (k != m &&
+ V[m - 1][m - 1 + k - 1] < V[m - 1][m - 1 + k + 1]))
+ {
+ curRegionAligned[resLen] = ALIGN_GAP;
+ targetRegionAligned[resLen] = targetRegion[--j];
+ resLen++;
+ m -= 1, k += 1;
+ }
+ else
+ {
+ curRegionAligned[resLen] = curRegion[--i];
+ targetRegionAligned[resLen] = ALIGN_GAP;
+ resLen++;
+ m -= 1, k -= 1;
+ }
+ }
+ Assert(m == 0 && k == 0);
+
+ /* Reverse alignment */
+ for (i = 0, j = resLen - 1; i < j; ++i, --j)
+ {
+ int16 t;
+
+ t = curRegionAligned[i];
+ curRegionAligned[i] = curRegionAligned[j];
+ curRegionAligned[j] = t;
+ t = targetRegionAligned[i];
+ targetRegionAligned[i] = targetRegionAligned[j];
+ targetRegionAligned[j] = t;
+ }
+
+ return resLen;
+}
+
+/*
+ * Try to build a short alignment in order to produce a short diff delta.
+ * If fails, return false, otherwise return true and write the delta to
+ * pageData->delta.
+ * Also return false if the produced alignment is not much better than
+ * the alignment with DIFF_REPLACE operations only (the minimal difference
+ * is set in MIN_DELTA_DIFFERECE in order to avoid cases when diff delta
+ * is larger than base delta because of the greater header size).
+ */
+static bool
+computeRegionDiffDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd)
+{
+ char *ptr = pageData->delta + pageData->deltaLen;
+ int i,
+ j;
+ char type;
+ char len;
+ OffsetNumber start;
+ OffsetNumber tmp;
+
+ int baseAlignmentCost = 0;
+ int diffAlignmentCost = 0;
+ int alignmentLength;
+
+ alignmentLength = alignRegions(&curpage[validStart],
+ &targetpage[targetStart],
+ validEnd - validStart,
+ targetEnd - targetStart);
+
+ /* If no proper alignment was found return false */
+ if (alignmentLength < 0)
+ return false;
+
+ /* Compute the cost of found alignment */
+ for (i = 0; i < alignmentLength; ++i)
+ diffAlignmentCost += (curRegionAligned[i] != targetRegionAligned[i]);
+
+ /*
+ * The following cycle computes the cost of alignment with DIFF_REPLACE
+ * operations only (similar to as if the previous delta was used). The
+ * position is match if both regions don't contain it, or if both regions
+ * contain that position and the bytes on it are equal. Otherwise the
+ * position is mismatch with cost 1.
+ */
+ for (i = Min(validStart, targetStart); i < validEnd || i < targetEnd; ++i)
+ baseAlignmentCost += (i < validStart || i < targetStart ||
+ i >= validEnd || i >= targetEnd ||
+ curpage[i] != targetpage[i]);
+
+ /*
+ * Check whether the found alignment is much better than the one with
+ * DIFF_PERLACE operations only.
+ */
+ if (baseAlignmentCost - MIN_DELTA_DIFFERECE < diffAlignmentCost)
+ return false;
+
+ /*
+ * Translate the alignment into the set of instructions for transformation
+ * from curRegion into targetRegion, and write these instructions into
+ * pageData->delta.
+ */
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + 4 * sizeof(tmp) <= MAX_DELTA_SIZE);
+
+ /* Write start and end indexes of the buffers */
+ tmp = validStart;
+ writeToPtr(ptr, tmp);
+ tmp = validEnd;
+ writeToPtr(ptr, tmp);
+ tmp = targetStart;
+ writeToPtr(ptr, tmp);
+ tmp = targetEnd;
+ writeToPtr(ptr, tmp);
+
+ /* Transform the alignment into the set of instructions */
+ start = 0;
+ for (i = 0; i < alignmentLength; ++i)
+ {
+ /* Verify the alignment */
+ Assert(curRegionAligned[i] != ALIGN_GAP ||
+ targetRegionAligned[i] != ALIGN_GAP);
+
+ /* If the values are equal, write no instructions */
+ if (curRegionAligned[i] == targetRegionAligned[i])
+ {
+ start++;
+ continue;
+ }
+
+ if (curRegionAligned[i] == ALIGN_GAP)
+ type = DIFF_INSERT;
+ else if (targetRegionAligned[i] == ALIGN_GAP)
+ type = DIFF_REMOVE;
+ else
+ type = DIFF_REPLACE;
+
+ /* Find the end of the block of the same instructions */
+ j = i + 1;
+ while (j < alignmentLength)
+ {
+ bool sameType = false;
+
+ switch (type)
+ {
+ case DIFF_INSERT:
+ sameType = (curRegionAligned[j] == ALIGN_GAP);
+ break;
+ case DIFF_REMOVE:
+ sameType = (targetRegionAligned[j] == ALIGN_GAP);
+ break;
+ case DIFF_REPLACE:
+ sameType = (curRegionAligned[j] != ALIGN_GAP &&
+ targetRegionAligned[j] != ALIGN_GAP &&
+ curRegionAligned[j] != targetRegionAligned[j]);
+ break;
+ default:
+ elog(ERROR, "unrecognized delta operation type: %d", type);
+ break;
+ }
+ if (sameType)
+ j++;
+ else
+ break;
+ }
+ len = j - i;
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(type) +
+ sizeof(len) + sizeof(start) <= MAX_DELTA_SIZE);
+ /* Write the header for instruction */
+ writeToPtr(ptr, type);
+ writeToPtr(ptr, len);
+ writeToPtr(ptr, start);
+
+ /* Write instruction data and go to the end of the block */
+ if (type != DIFF_REMOVE)
+ {
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + len <= MAX_DELTA_SIZE);
+ while (i < j)
+ {
+ char c = targetRegionAligned[i++];
+
+ writeToPtr(ptr, c);
+ }
+ }
+ else
+ i = j;
+ i--;
+
+ /* Shift start position which shows where to apply instruction */
+ if (type != DIFF_INSERT)
+ start += len;
+ }
+
+ pageData->deltaLen = ptr - pageData->delta;
+
+ return true;
+}
+
+/*
+ * Return whether pageData->delta contains diff deltas or not.
+ */
+static bool
+containsDiffDelta(PageData *pageData)
+{
+ char *ptr = pageData->delta + sizeof(bool);
+ char *end = pageData->delta + pageData->deltaLen;
+ char header;
+ int length;
+
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ if (header == DIFF_DELTA)
+ return true;
+ ptr += length;
+ }
+ return false;
+}
+
+/*
+ * Downgrade pageData->delta to base delta format.
+ *
+ * Only base diffs are allowed to perform the transformation.
+ */
+static void
+downgradeDeltaToBaseFormat(PageData *pageData)
+{
+ char *ptr = pageData->delta + sizeof(bool);
+ char *cur = pageData->delta + sizeof(bool);
+ char *end = pageData->delta + pageData->deltaLen;
+ char header;
+ int length;
+ int newDeltaLength = 0;
+
+ /* Check whether the first byte is false */
+ Assert(!*((bool *) pageData->delta));
+
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ /* Check whether the region delta is base delta */
+ Assert(header == BASE_DELTA);
+ newDeltaLength += length;
+
+ memmove(cur, ptr, length);
+ cur += length;
+ ptr += length;
+ }
+ pageData->deltaLen = newDeltaLength;
+}
+
+/*
* Compute the XLOG delta record needed to transform curpage into targetpage,
* and store it in pageData's delta field.
*/
@@ -233,7 +721,7 @@ computeDelta(PageData *pageData, Page curpage, Page targetpage)
curLower = ((PageHeader) curpage)->pd_lower,
curUpper = ((PageHeader) curpage)->pd_upper;
- pageData->deltaLen = 0;
+ pageData->deltaLen = sizeof(bool);
/* Compute delta records for lower part of page ... */
computeRegionDelta(pageData, curpage, targetpage,
@@ -245,6 +733,15 @@ computeDelta(PageData *pageData, Page curpage, Page targetpage)
curUpper, BLCKSZ);
/*
+ * Set first byte to true if at least one of the region deltas
+ * is diff delta. Otherwise set first byte to false and downgrade all
+ * regions to base format without extra headers.
+ */
+ *((bool *) pageData->delta) = containsDiffDelta(pageData);
+ if (!*((bool *) pageData->delta))
+ downgradeDeltaToBaseFormat(pageData);
+
+ /*
* If xlog debug is enabled, then check produced delta. Result of delta
* application to curpage should be equivalent to targetpage.
*/
@@ -451,27 +948,145 @@ GenericXLogAbort(GenericXLogState *state)
/*
* Apply delta to given page image.
+ *
+ * Read blocks of instructions and apply them based on their type:
+ * BASE_DELTA or DIFF_DELTA.
*/
static void
applyPageRedo(Page page, const char *delta, Size deltaSize)
{
const char *ptr = delta;
const char *end = delta + deltaSize;
+ char header;
+ int length;
+ bool diff_delta;
+
+ /* If page delta is base delta, apply it. */
+ readFromPtr(ptr, diff_delta);
+ if (!diff_delta)
+ {
+ applyPageBaseRedo(page, ptr, end - ptr);
+ return;
+ }
+
+ /* Otherwise apply each region delta. */
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ switch (header)
+ {
+ case DIFF_DELTA:
+ ptr = applyPageDiffRedo(page, ptr, length);
+ break;
+ case BASE_DELTA:
+ ptr = applyPageBaseRedo(page, ptr, length);
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized delta type: %d",
+ header);
+ break;
+ }
+ }
+}
+
+/*
+ * Apply base delta to given page image.
+ */
+static const char *
+applyPageBaseRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
while (ptr < end)
{
OffsetNumber offset,
length;
- memcpy(&offset, ptr, sizeof(offset));
- ptr += sizeof(offset);
- memcpy(&length, ptr, sizeof(length));
- ptr += sizeof(length);
+ readFromPtr(ptr, offset);
+ readFromPtr(ptr, length);
memcpy(page + offset, ptr, length);
ptr += length;
}
+ return ptr;
+}
+
+/*
+ * Apply diff delta to given page image.
+ */
+static const char *
+applyPageDiffRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
+ OffsetNumber targetStart,
+ targetEnd;
+ OffsetNumber validStart,
+ validEnd;
+ int i,
+ j;
+ OffsetNumber start;
+
+ /* Read start and end indexes of the buffers */
+ readFromPtr(ptr, validStart);
+ readFromPtr(ptr, validEnd);
+ readFromPtr(ptr, targetStart);
+ readFromPtr(ptr, targetEnd);
+
+ /* Read and apply the instructions */
+ i = 0, j = 0;
+ while (ptr < end)
+ {
+ char type;
+ char len;
+
+ /* Read the header of the current instruction */
+ readFromPtr(ptr, type);
+ readFromPtr(ptr, len);
+ readFromPtr(ptr, start);
+
+ /* Copy the data before current instruction to buffer */
+ memcpy(&localPage[j], page + validStart + i, start - i);
+ j += start - i;
+ i = start;
+
+ /* Apply the instruction */
+ switch (type)
+ {
+ case DIFF_INSERT:
+ memcpy(&localPage[j], ptr, len);
+ ptr += len;
+ j += len;
+ break;
+ case DIFF_REMOVE:
+ i += len;
+ break;
+ case DIFF_REPLACE:
+ memcpy(&localPage[j], ptr, len);
+ i += len;
+ j += len;
+ ptr += len;
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized delta instruction type: %d",
+ type);
+ break;
+ }
+ }
+
+ /* Copy the data after the last instruction */
+ memcpy(&localPage[j], page + validStart + i, validEnd - validStart - i);
+ j += validEnd - validStart - i;
+ i = validEnd - validStart;
+
+ memcpy(page + targetStart, localPage, j);
+ return ptr;
}
/*
Oleg Ivanov <o.ivanov@postgrespro.ru> wrote:
In order to overcome that issue, I would like to propose the patch, which
provides possibility to use another approach of the WAL record
construction.
After having spent several hours reviewing this patch I dare to send the
following comments:
* writeToPtr() and readFromPtr() are applied to the existing code. I think
this is a reason for a separate diff, to be applied before the actual patch.
BTW, if you're going to do any refactoring of the existing code, I think the
"Begin" and "Start" words should be used consistently, see "fragmentBegin" vs
"validStart" in computeRegionBaseDelta().
* What's the reason for changing FRAGMENT_HEADER_SIZE ? I see no change in
the data layout that writeFragment() produces.
* alignRegions()
** typo in the prologue: "mismathing".
** "An O(ND) Difference Algorithm and Its Variations" - I think the reference
should contain more information, e.g. name of the author(s). I think I even
saw URLs to scientific papers in the PG source but I'm not 100% sure right
now.
** As for the algorithm itself: Although I didn't have enough patience (and
time) to follow it in every detail for this first review, I think I can
imagine what it is about. Yet I think reading would be easier if the
concepts of alignment and "diff delta" were depicted in the comments,
perhaps using some sort of "ASCII graphics".
** DiffDeltaOperations enumeration: the individual values should be described.
* computeRegionDiffDelta()
** Does "previous delta" in one of the comments refer to "base delta",
i.e. the delta computation w/o your patch? If so, the comment should
mention it explicitly.
** If you use Min() to initialize the for-loop, it'd be more consistent if you
also used Max() when checking the limits:
for (i = Min(validStart, targetStart); i < Max(validEnd, targetEnd); ++i)
And similarly you can simplify the body of the loop.
* computeDelta()
As the expresssion
*((bool *) pageData->delta)
is used more than once, I think a separate (bool *) variable should be used.
--
Antonin Houska
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de, http://www.cybertec.at
I wonder if the algorithm could be changed to operate with units of 2 or 4
bytes (instead of 1).
Since the algorithm speed is essentially doubled/quadrupled it could yield
a better runtime/compression trade-off.
Does that make sense?
--
Arthur Silva
On Wed, Nov 1, 2017 at 12:43 AM, Oleg Ivanov <o.ivanov@postgrespro.ru>
wrote:
Show quoted text
Hackers,
a few years ago generic WAL was proposed by Alexander Korotkov (
/messages/by-id/CAPpHfdsXwZmojm6
Dx%2BTJnpYk27kT4o7Ri6X_4OSWcByu1Rm%2BVA%40mail.gmail.com#CAP
pHfdsXwZmojm6Dx+TJnpYk27kT4o7Ri6X_4OSWcByu1Rm+VA@mail.gmail.com). and was
committed into PostgreSQL 9.6.
One of the generic WAL advantages is the common interface for safe
interaction with WAL for modules and extensions. The interface allows
module to register the page, then change it, and then generic WAL generates
and writes into xlog the algorithm of changing the old version of page into
the new one. In the case of crash and recovery this algorithm may be
applied.Bloom and RUM indexes use generic WAL. The issue is that the generic
algorithm of turning the old page into the new one is not optimal in the
sense of record size in xlog. Now the main idea of the approach is to find
all positions in the page where new version is different from the original
one and write these changes into xlog. It works well if the page was
rewritten completely or if only a few bytes have been changed.
Nevertheless, this approach produces too large WAL record in the case of
inserting or deleting a few bytes into the page. In this case there are
almost no position where the old version and the new one are equal, so the
record size is near the page size, but actual amount of changes in the page
is small. This is exactly what happens often in RUM indexes.In order to overcome that issue, I would like to propose the patch, which
provides possibility to use another approach of the WAL record
construction. If another approach fails to make a short enough record, it
rolls back to the original approach. The main idea of another approach is
not to perform bytewise comparison of pages, but finding the minimal
editing distance between two pages and the corresponding editing algorithm.
In the general case, finding editing distance takes O(N^2) time and memory.
But there is also an algorithm which requires only O(ND) time and O(D^2)
memory, where D is the editing distance between two sequences. Also for
given D algorithm may show that the minimal editing distance between two
sequences is more than D in the same amount of time and memory.The special case of this algorithm which does not consider replace
operations is described in the paper (http://www.xmailserver.org/diff2.pdf).
The version of this algorithm which consumes O(ND) time and O(N) memory is
used in diff console command, but for our purposes we don't need to
increase the constant for the time in order to decrease memory complexity.
For RUM indexes we usually have small enough editing distance (less than
64), so D^2 is not too much to store.The results of experiments:
+------------------------------+----------------+----------- -----+----------------+----------------+ | Name | WAL_diff(MB) | WAL_orig(MB) | Time_diff(s) | Time_orig(s) | +------------------------------+----------------+----------- -----+----------------+----------------+ | rum: make installcheck | 38.9 | 82.5 | 4.37 | 4.16 | +------------------------------+----------------+----------- -----+----------------+----------------+ | bloom: make installcheck | 1.0 | 1.0 | 0.66 | 0.53 | +------------------------------+----------------+----------- -----+----------------+----------------+ | test00.sql | 20.5 | 51.0 | 1.86 | 1.41 | +------------------------------+----------------+----------- -----+----------------+----------------+ | test01.sql | 207.1 | 219.7 | 8.06 | 6.89 | +------------------------------+----------------+----------- -----+----------------+----------------+We can see that the patch provides a little slowdown, but compresses
generic WAL efficiently for RUM index. Also I'm going to do a few more
experiments on this patch with another data.The patch was tested on Lubuntu 14.04, but should not contain any
platform-specific items. The patch, the files and scripts for doing the
experiments and performance tests are attached.Oleg Ivanov
Postgres Professional
The Russian PostgreSQL Company--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Antonin Houska <ah@cybertec.at> wrote:
Oleg Ivanov <o.ivanov@postgrespro.ru> wrote:
In order to overcome that issue, I would like to propose the patch, which
provides possibility to use another approach of the WAL record
construction.After having spent several hours reviewing this patch I dare to send the
following comments:
One more idea:
I think the metadata (ALIGN_GAP) should be stored separate from the actual
data so that you can use memcpy() instead of this loop:
while (i < j)
{
char c = targetRegionAligned[i++];
writeToPtr(ptr, c);
}
--
Antonin Houska
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de, http://www.cybertec.at
On Mon, Nov 20, 2017 at 4:49 PM, Antonin Houska <ah@cybertec.at> wrote:
One more idea:
I think the metadata (ALIGN_GAP) should be stored separate from the actual
data so that you can use memcpy() instead of this loop:while (i < j)
{
char c = targetRegionAligned[i++];writeToPtr(ptr, c);
}
Moved to next CF per lack of reviews. The patch still applies.
Oleg, I would recommend as well that you do more patch reviews. Let's
not forget that for one patch submitted you should review one patch
with a somewhat equal difficulty, and yours is quite specialized.
--
Michael
Oleg,
I'm not really sure why this is still in Needs Review as a review was
posted and I don't see any follow-up. I've changed this to be Waiting
for Author.
* Antonin Houska (ah@cybertec.at) wrote:
Oleg Ivanov <o.ivanov@postgrespro.ru> wrote:
In order to overcome that issue, I would like to propose the patch, which
provides possibility to use another approach of the WAL record
construction.After having spent several hours reviewing this patch I dare to send the
following comments:
Thanks for the review Antonin!
* writeToPtr() and readFromPtr() are applied to the existing code. I think
this is a reason for a separate diff, to be applied before the actual patch.
I'm not really a fan of using these, for my 2c. I'm not sure how others
feel, but having these macros which change one of the arguments to the
macro (by advancing the pointer) doesn't result in a net improvement to
readability for me.
The other review comments also seem pretty reasonable to me.
Thanks!
Stephen
Hi Oleg,
On 1/22/18 4:37 PM, Stephen Frost wrote:
Oleg,
I'm not really sure why this is still in Needs Review as a review was
posted and I don't see any follow-up. I've changed this to be Waiting
for Author.* Antonin Houska (ah@cybertec.at) wrote:
Oleg Ivanov <o.ivanov@postgrespro.ru> wrote:
In order to overcome that issue, I would like to propose the patch, which
provides possibility to use another approach of the WAL record
construction.After having spent several hours reviewing this patch I dare to send the
following comments:Thanks for the review Antonin!
* writeToPtr() and readFromPtr() are applied to the existing code. I think
this is a reason for a separate diff, to be applied before the actual patch.I'm not really a fan of using these, for my 2c. I'm not sure how others
feel, but having these macros which change one of the arguments to the
macro (by advancing the pointer) doesn't result in a net improvement to
readability for me.The other review comments also seem pretty reasonable to me.
This proposal is still in need of review and hasn't really gotten it in
the last few CFs.
Since the feature does not seem like a good fit for the last CF of PG11
I am marking it Returned with Feedback.
Regards,
--
-David
david@pgmasters.net
Hi David,
On 07/02/18 18:31, David Steele wrote:
This proposal is still in need of review and hasn't really gotten it in
the last few CFs.
Agreed.
Since the feature does not seem like a good fit for the last CF of PG11
I am marking it Returned with Feedback.
Is there any chance that it may be still resubmitted / reopened for the
March CF?
Actually right now I am in the process of reviewing it, for it would be
really useful for my "OUZO" project (a fork of the RUM access method).
One general comment I can already make is that enabling compression
should be made optional, which appears to be a small and easy addition
to the generic WAL API.
Best regards,
--
Markus Nullmeier http://www.g-vo.org
German Astrophysical Virtual Observatory (GAVO)
Greetings,
* Markus Nullmeier (dq124@uni-heidelberg.de) wrote:
On 07/02/18 18:31, David Steele wrote:
This proposal is still in need of review and hasn't really gotten it in
the last few CFs.Agreed.
Since the feature does not seem like a good fit for the last CF of PG11
I am marking it Returned with Feedback.Is there any chance that it may be still resubmitted / reopened for the
March CF?
Of course, you're certainly welcome to resubmit it to a future CF. As
we're getting close to the end of this release cycle, I tend to think it
would be appropriate to submit it for the post-11 CF rather than the one
in March.
Actually right now I am in the process of reviewing it, for it would be
really useful for my "OUZO" project (a fork of the RUM access method).
Glad to hear that you're continuing to work on it.
One general comment I can already make is that enabling compression
should be made optional, which appears to be a small and easy addition
to the generic WAL API.
The question would be- is there a way for us to determine when it should
be enabled or not enabled, or do we have to ask the user to tell us?
Asking the user is something we'd really not do unless we absolutely
have to. Much better is if we can figure out when it's best to enable
or disable (or use/not use) based on some criteria and do so
automatically.
Thanks!
Stephen
On 07/02/18 19:42, Stephen Frost wrote:
really useful for my "OUZO" project (a fork of the RUM access method).
Glad to hear that you're continuing to work on it.
Yes, it will be available on Github eventually.
One general comment I can already make is that enabling compression
should be made optional, which appears to be a small and easy addition
to the generic WAL API.The question would be- is there a way for us to determine when it should
be enabled or not enabled, or do we have to ask the user to tell us?
My suggestion would be adding something like
#define GENERIC_XLOG_MOVE_DETECT_ON 0x0002 /* search for moved
parts of page image */
#define GENERIC_XLOG_MOVE_DETECT_OFF 0x0004 /* do not search for
moved parts of page image */
This would be used for the 'flags' argument of GenericXLogRegisterBuffer(),
allowing code using generic WAL (such as custom access methods) to
control this kind of compression on a per buffer basis, if need be.
Then it would be up to the author of any extension relying on generic
WAL to decide if and how the user will be given control of WAL
compression.
However,
Asking the user is something we'd really not do unless we absolutely
have to. Much better is if we can figure out when it's best to enable
or disable (or use/not use) based on some criteria and do so
automatically.
this is a point I did not think of before :-) Probably a logical
choice would be having "GENERIC_XLOG_MOVE_DETECT" default to true
for a 'wal_level' value of 'replica' (the default setting nowadays)
or higher, and default to false for 'minimal'.
In this way, generic WAL will be automatically compressed when it is
potentially stored for some longer time, or consuming network bandwidth.
--
Markus Nullmeier http://www.g-vo.org
German Astrophysical Virtual Observatory (GAVO)
Hello everyone!
Unfortunately, I faced the use case with RUM index, in which my patch
produced
enormously large time overhead (queries execution time is about 2 or 3 times
slower). Only now I finally managed to limit this overhead by 20% or
even make
it statistically insignificant on my HDD. Nevertheless, SSD evaluation
is still
required.
So I attach new test totat_test.sh script which includes the bad RUM use
case
and the new version of the patch.
Thanks everybody for review, it improved the patch a lot. The majority
of the
listed drawbacks were fixed, the others are discussed below.
On 11/16/2017 08:31 PM, Antonin Houska wrote:
* writeToPtr() and readFromPtr() are applied to the existing code. I think
this is a reason for a separate diff, to be applied before the actual patch.
Removed the existing code refactoring, that should be a separate patch
indeed.
* What's the reason for changing FRAGMENT_HEADER_SIZE ? I see no change in
the data layout that writeFragment() produces.
Diff delta has its own header which consists of one char and one int. To
make
this point more clear, I defined DIFF_DELTA_HEADER_SIZE and added
2 * DIFF_DELTA_HEADER_SIZE to MAX_DELTA_SIZE.
** "An O(ND) Difference Algorithm and Its Variations" - I think the reference
should contain more information, e.g. name of the author(s). I think I even
saw URLs to scientific papers in the PG source but I'm not 100% sure right
now.
The reference to the paper contains much more information now.
** As for the algorithm itself: Although I didn't have enough patience (and
time) to follow it in every detail for this first review, I think I can
imagine what it is about. Yet I think reading would be easier if the
concepts of alignment and "diff delta" were depicted in the comments,
perhaps using some sort of "ASCII graphics".
I briefly described the main idea of the dynamical programming algorithm
in comments of alignRegions. It can be further refined if you think it is
still unclear now, but I'm afraid the detailed description will result
in just
copying the referred paper into the comments.
** DiffDeltaOperations enumeration: the individual values should be described.
* computeRegionDiffDelta()
** Does "previous delta" in one of the comments refer to "base delta",
i.e. the delta computation w/o your patch? If so, the comment should
mention it explicitly.** If you use Min() to initialize the for-loop, it'd be more consistent if you
also used Max() when checking the limits:for (i = Min(validStart, targetStart); i < Max(validEnd, targetEnd); ++i)
And similarly you can simplify the body of the loop.
I found that this part of code is unnecessary and deleted it.
On 11/17/2017 03:02 PM, Arthur Silva wrote:
I wonder if the algorithm could be changed to operate with units of 2
or 4 bytes (instead of 1).
Since the algorithm speed is essentially doubled/quadrupled it could
yield a better runtime/compression trade-off.Does that make sense?
This approach implies that we cannot detect data shift by one byte, for
example.
It is suitable for 2- or 4-bytes-aligned data, but looks not general enough.
On 01/23/2018 12:37 AM, Stephen Frost wrote:
* writeToPtr() and readFromPtr() are applied to the existing code. I think
this is a reason for a separate diff, to be applied before the actual patch.I'm not really a fan of using these, for my 2c. I'm not sure how others
feel, but having these macros which change one of the arguments to the
macro (by advancing the pointer) doesn't result in a net improvement to
readability for me.
Sounds reasonable, but the patch uses such constructions as writeToPtr
rather
often. How do you feel about using writeToPtr(&ptr, data) which which more
explicitly indicates the first argument changing?
On 02/07/2018 09:02 PM, Markus Nullmeier wrote:
One general comment I can already make is that enabling compression
should be made optional, which appears to be a small and easy addition
to the generic WAL API.
Now I'm working on this.I consider creation the function
GenericXLogRegisterBufferEx in addition to GenericXLogRegisterBuffer.
GenericXLogRegisterBufferEx will take a structure with parameters for diff
delta such as maximal alignment length, to which parts of page it must be
applied, etc. And GenericXLogRegisterBufferwill call
GenericXLogRegisterBufferEx with default parameters. This allows using
different compression settings for different indexes. What do you think
about
such solution?
Attachments:
generic_xlog_diffdelta_v2.patchtext/x-patch; name=generic_xlog_diffdelta_v2.patchDownload
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index 3adbf7b..2ed4146 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -44,8 +44,16 @@
*-------------------------------------------------------------------------
*/
#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber))
+#define DIFF_DELTA_HEADER_SIZE (sizeof(char) + sizeof(int))
#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE
-#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE)
+#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE + \
+ 2 * DIFF_DELTA_HEADER_SIZE) + sizeof(bool)
+
+#define MAX_ALIGN_MISMATCHES 24
+/* MAX_ALIGN_MISMATCHES is supposed to be not greater than PG_UINT8_MAX */
+
+#define writeToPtr(ptr, x) memcpy(ptr, &(x), sizeof(x)), ptr += sizeof(x)
+#define readFromPtr(ptr, x) memcpy(&(x), ptr, sizeof(x)), ptr += sizeof(x)
/* Struct of generic xlog data for single page */
typedef struct
@@ -71,15 +79,62 @@ struct GenericXLogState
bool isLogged;
};
+/* Describes for the region which type of delta is used in it */
+typedef enum
+{
+ DIFF_DELTA = 0, /* diff delta with insert, remove and replace
+ * operations */
+ BASE_DELTA = 1 /* base delta with update operations only */
+} DeltaType;
+
+/* Diff delta operations for transforming current page to target page */
+typedef enum
+{
+ DIFF_INSERT = 0,
+ DIFF_REMOVE = 1,
+ DIFF_REPLACE = 2
+} DiffDeltaOperations;
+
static void writeFragment(PageData *pageData, OffsetNumber offset,
OffsetNumber len, const char *data);
static void computeRegionDelta(PageData *pageData,
const char *curpage, const char *targetpage,
int targetStart, int targetEnd,
- int validStart, int validEnd);
+ int validStart, int validEnd,
+ bool forceBaseDelta);
static void computeDelta(PageData *pageData, Page curpage, Page targetpage);
static void applyPageRedo(Page page, const char *delta, Size deltaSize);
+static int alignRegions(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen);
+
+static bool computeRegionDiffDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd);
+static const char *applyPageDiffRedo(Page page, const char *delta, Size deltaSize);
+
+static void computeRegionBaseDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd);
+static const char *applyPageBaseRedo(Page page, const char *delta, Size deltaSize);
+
+static bool pageDataContainsDiffDelta(PageData *pageData);
+static void downgradeDeltaToBaseFormat(PageData *pageData);
+
+/* Arrays for the alignment building and for the resulting alignments */
+static int V[MAX_ALIGN_MISMATCHES + 1][2 * MAX_ALIGN_MISMATCHES + 1];
+static int prevDiag[MAX_ALIGN_MISMATCHES + 1][2 * MAX_ALIGN_MISMATCHES + 1];
+static int alignmentDiag[MAX_ALIGN_MISMATCHES + 1];
+static char curRegionAligned[BLCKSZ + MAX_ALIGN_MISMATCHES];
+static bool curRegionAlignedGap[BLCKSZ + MAX_ALIGN_MISMATCHES];
+static char targetRegionAligned[BLCKSZ + MAX_ALIGN_MISMATCHES];
+static bool targetRegionAlignedGap[BLCKSZ + MAX_ALIGN_MISMATCHES];
+
+/* Array for diff delta application */
+static char localPage[BLCKSZ];
+
/*
* Write next fragment into pageData's delta.
@@ -115,14 +170,76 @@ writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
* Bytes in curpage outside the range validStart to validEnd-1 should be
* considered invalid, and always overwritten with target data.
*
- * This function is a hot spot, so it's worth being as tense as possible
- * about the data-matching loops.
+ * If forceBaseDelta is true, fucntion just calls computeRegionBaseDelta.
+ * Otherwise this function tries to build diff delta first and, if it fails,
+ * uses the base delta. It also provides the header before the delta in which
+ * the type and the length of the delta are contained.
*/
static void
computeRegionDelta(PageData *pageData,
const char *curpage, const char *targetpage,
int targetStart, int targetEnd,
- int validStart, int validEnd)
+ int validStart, int validEnd,
+ bool forceBaseDelta)
+{
+ int length;
+ char header;
+ int prevDeltaLen;
+ bool diff = false;
+ char *ptr = pageData->delta + pageData->deltaLen;
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(header) +
+ sizeof(length) <= MAX_DELTA_SIZE);
+
+ pageData->deltaLen += sizeof(header) + sizeof(length);
+ prevDeltaLen = pageData->deltaLen;
+
+ /* Try building diff delta only if necessary */
+ if (!forceBaseDelta)
+ {
+ diff = computeRegionDiffDelta(pageData,
+ curpage, targetpage,
+ targetStart, targetEnd,
+ validStart, validEnd);
+ }
+ /*
+ * If we succeeded to make diff delta, just set the header.
+ * Otherwise, make base delta.
+ */
+ if (diff)
+ {
+ header = DIFF_DELTA;
+ }
+ else
+ {
+ header = BASE_DELTA;
+ computeRegionBaseDelta(pageData,
+ curpage, targetpage,
+ targetStart, targetEnd,
+ validStart, validEnd);
+ }
+ length = pageData->deltaLen - prevDeltaLen;
+
+ writeToPtr(ptr, header);
+ writeToPtr(ptr, length);
+}
+
+/*
+ * Compute the XLOG fragments needed to transform a region of curpage into the
+ * corresponding region of targetpage, and append them to pageData's delta
+ * field. The region to transform runs from targetStart to targetEnd-1.
+ * Bytes in curpage outside the range validStart to validEnd-1 should be
+ * considered invalid, and always overwritten with target data.
+ *
+ * This function is a hot spot, so it's worth being as tense as possible
+ * about the data-matching loops.
+ */
+static void
+computeRegionBaseDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd)
{
int i,
loopEnd,
@@ -222,27 +339,458 @@ computeRegionDelta(PageData *pageData,
}
/*
+ * Align curRegion and targetRegion and return the length of the alignment
+ * or -1 if the alignment with number of mismatching positions less than
+ * or equal to MAX_ALIGN_MISMATCHES does not exist.
+ * The algorithm guarantees to find the alignment with the least possible
+ * number of mismathing positions or return that such least number is greater
+ * than MAX_ALIGN_MISMATCHES.
+ *
+ * For a good introduction to the subject, read about the "Levenshtein
+ * distance" in Wikipedia.
+ *
+ * The basic algorithm is described in:
+ * "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
+ * Algorithmica Vol. 1, 1986, pp. 251-266,
+ * <http://dx.doi.org/10.1007/BF01840446>,
+ * PDF: <http://mail.xmailserver.net/diff2.pdf>.
+ * See especially section 3, which describes the variation used below.
+ *
+ * This variation requires O(N + D ^ 2) memory and has time complexity O(ND).
+ * We choose it because it is faster than described in section 4.2 modification
+ * with O(N + D) memory requirement.
+ *
+ * The only modification we made to the original algorithm is the introduction
+ * of REPLACE operations, while in the original algorithm only INSERT and
+ * REMOVE are considered. This introduction doesn't affect time and memory
+ * complexity of the algorithm.
+ *
+ * The alignment is stored in curRegionAligned, targetRegionAligned,
+ * curRegionAlignedGap and targetRegionAlignedGap. The first two arrays
+ * contain the aligned data and the second two contain the map of align gaps
+ * in the first two arrays.
+ */
+static int
+alignRegions(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen)
+{
+ /* Number of mismatches */
+ int m;
+
+ /* Difference between curRegion and targetRegion prefix lengths */
+ int k;
+
+ /* Curbuf and targetRegion prefix lengths */
+ int i,
+ j;
+ /* Result alignment length */
+ int resLen;
+
+ /* Maximal possible result alignment length */
+ int maxResLen;
+
+ /* The length of the equal block */
+ int curLen;
+
+ /* Number of mismathes in the answer */
+ int numMismatches = -1;
+
+ /*
+ * If lengths differ too much, there is no alignment with a small
+ * number of mismatches.
+ */
+ if (!(-MAX_ALIGN_MISMATCHES < curRegionLen - targetRegionLen &&
+ curRegionLen - targetRegionLen < MAX_ALIGN_MISMATCHES))
+ return -1;
+
+ /*
+ * V is an array to store the values of dynamic programming. The first
+ * dimension corresponds to m, i. e. to the number of performed editing
+ * operations, and the second one is for k + m, where k is the number of
+ * a diagonal.
+ * A diagonal numbered k is such points (i, j) where i - j = k.
+ * Here i means the length of curRegion prefix and j means the length
+ * of targetRegion prefix.
+ * V[m][m + k] is the length of the longest prefix of curRegion i
+ * which can be aligned with the prefix of length j of targetRegion
+ * using m editing operations.
+ * In the loop below we initialize V[0][0] and then compute V[m][m + k]
+ * based on, if defined, V[m - 1][m + k - 1], V[m - 1][m + k], and
+ * V[m - 1][m + k + 1].
+ * V[m][m + k] is undefined if k < -m or k > m.
+ */
+ V[0][0] = 0;
+
+ /* Find the longest path with the given number of mismatches */
+ for (m = 0; m <= MAX_ALIGN_MISMATCHES; ++m)
+ {
+ /*
+ * Find the largest prefix alignment with the given number of
+ * mismatches and the given diagonal, i. e. difference between
+ * curRegion and targetRegion prefix lengths.
+ */
+ for (k = -m; k <= m; ++k)
+ {
+ /* Dynamic programming recurrent step */
+ if (m > 0)
+ {
+ i = -1;
+ if (k != -m && k != m &&
+ V[m - 1][m - 1 + k] + 1 > i)
+ {
+ i = V[m - 1][m - 1 + k] + 1;
+ prevDiag[m][m + k] = k;
+ }
+ if (k != -m && k != -m + 1 &&
+ V[m - 1][m - 1 + k - 1] + 1 > i)
+ {
+ i = V[m - 1][m - 1 + k - 1] + 1;
+ prevDiag[m][m + k] = k - 1;
+ }
+ if (k != m && k != m - 1 &&
+ V[m - 1][m - 1 + k + 1] > i)
+ {
+ i = V[m - 1][m - 1 + k + 1];
+ prevDiag[m][m + k] = k + 1;
+ }
+ }
+ else
+ i = 0;
+ j = i - k;
+
+ /* Boundary checks */
+ Assert(i >= 0);
+ Assert(j >= 0);
+
+ /* Increase the prefixes while the bytes are equal */
+ while (i < curRegionLen && j < targetRegionLen &&
+ curRegion[i] == targetRegion[j])
+ i++, j++;
+
+ /*
+ * Save the largest curRegion prefix that was aligned with given
+ * number of mismatches and difference between curRegion and
+ * targetRegion prefix lengths.
+ */
+ V[m][m + k] = i;
+
+ /* If we find the alignment, save its length and break */
+ if (i == curRegionLen && j == targetRegionLen)
+ {
+ numMismatches = m;
+ break;
+ }
+ }
+ /* Break if we find an alignment */
+ if (numMismatches != -1)
+ break;
+ }
+ /* No alignment was found */
+ if (numMismatches == -1)
+ return -1;
+
+ /*
+ * Restore the path k-s for each iteration of the main loop
+ * for the found alignment.
+ */
+ Assert(m >= 0);
+ while (m != 0)
+ {
+ Assert(-m <= k && k <= m);
+ alignmentDiag[m] = k;
+ k = prevDiag[m][m + k];
+ m--;
+ }
+ Assert(k == 0);
+ alignmentDiag[0] = 0;
+
+ /* Clear RegionAlignedGap arrays */
+ maxResLen = Min(curRegionLen, targetRegionLen) + numMismatches;
+ memset(curRegionAlignedGap, 0, maxResLen);
+ memset(targetRegionAlignedGap, 0, maxResLen);
+
+ /* Process the first equal block */
+ resLen = V[0][0];
+ Assert(resLen <= Min(curRegionLen, targetRegionLen));
+ memcpy(curRegionAligned, curRegion, V[0][0]);
+ memcpy(targetRegionAligned, targetRegion, V[0][0]);
+
+ /* Restore the rest of the alignment */
+ for (m = 1; m <= numMismatches; ++m)
+ {
+ /* Initialize the variables for the block */
+ int dk = alignmentDiag[m] - alignmentDiag[m - 1];
+ int prevDiag = alignmentDiag[m - 1];
+ k = alignmentDiag[m];
+ i = V[m - 1][m - 1 + prevDiag];
+ j = i - prevDiag;
+ /* Check state consistency */
+ Assert(0 <= i && i <= curRegionLen);
+ Assert(0 <= j && j <= targetRegionLen);
+
+ /* Check the block operation correctness */
+ Assert(dk == -1 || i < curRegionLen);
+ Assert(dk == 1 || j < targetRegionLen);
+ Assert(-1 <= dk && dk <= 1);
+ Assert(resLen + 1 <= maxResLen);
+
+ /* Do the alignment operation of the block */
+ if (dk == 1 || dk == 0)
+ curRegionAligned[resLen] = curRegion[i++];
+ else
+ curRegionAlignedGap[resLen] = true;
+ if (dk == 0 || dk == -1)
+ targetRegionAligned[resLen] = targetRegion[j++];
+ else
+ targetRegionAlignedGap[resLen] = true;
+ resLen++;
+
+ /* Compute the size of the equal part of the block */
+ curLen = V[m][m + k] - i;
+
+ /* Check whether the equal part of the block is computed correctly */
+ Assert(curLen >= 0);
+ Assert(resLen + curLen <= maxResLen);
+ Assert(memcmp(&curRegion[i], &targetRegion[j], curLen) == 0);
+
+ /* Copy the equal part of the block */
+ memcpy(&curRegionAligned[resLen], &curRegion[i], curLen);
+ memcpy(&targetRegionAligned[resLen], &targetRegion[j], curLen);
+ resLen += curLen;
+ }
+
+ return resLen;
+}
+
+/*
+ * Try to build a short alignment in order to produce a short diff delta.
+ * If fails, return false, otherwise return true and write the delta to
+ * pageData->delta.
+ */
+static bool
+computeRegionDiffDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd)
+{
+ char *ptr = pageData->delta + pageData->deltaLen;
+ int i,
+ j;
+ char type;
+ char len;
+ OffsetNumber start;
+ OffsetNumber tmp;
+
+ int alignmentLength;
+
+ alignmentLength = alignRegions(&curpage[validStart],
+ &targetpage[targetStart],
+ validEnd - validStart,
+ targetEnd - targetStart);
+
+ /* If no proper alignment was found return false */
+ if (alignmentLength < 0)
+ return false;
+
+ /*
+ * Translate the alignment into the set of instructions for transformation
+ * from curRegion into targetRegion, and write these instructions into
+ * pageData->delta.
+ */
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + 4 * sizeof(tmp) <= MAX_DELTA_SIZE);
+
+ /* Write start and end indexes of the buffers */
+ tmp = validStart;
+ writeToPtr(ptr, tmp);
+ tmp = validEnd;
+ writeToPtr(ptr, tmp);
+ tmp = targetStart;
+ writeToPtr(ptr, tmp);
+ tmp = targetEnd;
+ writeToPtr(ptr, tmp);
+
+ /* Transform the alignment into the set of instructions */
+ start = 0;
+ for (i = 0; i < alignmentLength; ++i)
+ {
+ /* Verify the alignment */
+ Assert(!curRegionAlignedGap[i] || !targetRegionAlignedGap[i]);
+
+ /* If the values are equal, write no instructions */
+ if (curRegionAligned[i] == targetRegionAligned[i] &&
+ !curRegionAlignedGap[i] && !targetRegionAlignedGap[i])
+ {
+ start++;
+ continue;
+ }
+
+ if (curRegionAlignedGap[i])
+ type = DIFF_INSERT;
+ else if (targetRegionAlignedGap[i])
+ type = DIFF_REMOVE;
+ else
+ type = DIFF_REPLACE;
+
+ /* Find the end of the block of the same instructions */
+ j = i + 1;
+ while (j < alignmentLength)
+ {
+ bool sameType = false;
+
+ switch (type)
+ {
+ case DIFF_INSERT:
+ sameType = (curRegionAlignedGap[j]);
+ break;
+ case DIFF_REMOVE:
+ sameType = (targetRegionAlignedGap[j]);
+ break;
+ case DIFF_REPLACE:
+ sameType = (!curRegionAlignedGap[j] &&
+ !targetRegionAlignedGap[j] &&
+ curRegionAligned[j] != targetRegionAligned[j]);
+ break;
+ default:
+ elog(ERROR, "unrecognized delta operation type: %d", type);
+ break;
+ }
+ if (sameType)
+ j++;
+ else
+ break;
+ }
+ len = j - i;
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(type) +
+ sizeof(len) + sizeof(start) <= MAX_DELTA_SIZE);
+ /* Write the header for instruction */
+ writeToPtr(ptr, type);
+ writeToPtr(ptr, len);
+ writeToPtr(ptr, start);
+
+ /* Write instruction data and go to the end of the block */
+ if (type != DIFF_REMOVE)
+ {
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + len <= MAX_DELTA_SIZE);
+ while (i < j)
+ {
+ char c = targetRegionAligned[i++];
+
+ writeToPtr(ptr, c);
+ }
+ }
+ else
+ i = j;
+ i--;
+
+ /* Shift start position which shows where to apply instruction */
+ if (type != DIFF_INSERT)
+ start += len;
+ }
+
+ pageData->deltaLen = ptr - pageData->delta;
+
+ return true;
+}
+
+/*
+ * Return whether pageData->delta contains diff deltas or not.
+ */
+static bool
+pageDataContainsDiffDelta(PageData *pageData)
+{
+ char *ptr = pageData->delta + sizeof(bool);
+ char *end = pageData->delta + pageData->deltaLen;
+ char header;
+ int length;
+
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ if (header == DIFF_DELTA)
+ return true;
+ ptr += length;
+ }
+ return false;
+}
+
+/*
+ * Downgrade pageData->delta to base delta format.
+ *
+ * Only base diffs are allowed to perform the transformation.
+ */
+static void
+downgradeDeltaToBaseFormat(PageData *pageData)
+{
+ char *ptr = pageData->delta;
+ char *end = pageData->delta + pageData->deltaLen;
+ char *cur;
+ bool containsDiffDelta;
+ char header;
+ int length;
+ int newDeltaLength = 0;
+
+ /* Check whether containsDiffDelta is false */
+ readFromPtr(ptr, containsDiffDelta);
+ Assert(!containsDiffDelta);
+
+ cur = ptr;
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ /* Check whether the region delta is base delta */
+ Assert(header == BASE_DELTA);
+ newDeltaLength += length;
+
+ memmove(cur, ptr, length);
+ cur += length;
+ ptr += length;
+ }
+ pageData->deltaLen = newDeltaLength;
+}
+
+/*
* Compute the XLOG delta record needed to transform curpage into targetpage,
* and store it in pageData's delta field.
*/
static void
computeDelta(PageData *pageData, Page curpage, Page targetpage)
{
+ bool *containsDiffDelta = pageData->delta;
int targetLower = ((PageHeader) targetpage)->pd_lower,
targetUpper = ((PageHeader) targetpage)->pd_upper,
curLower = ((PageHeader) curpage)->pd_lower,
curUpper = ((PageHeader) curpage)->pd_upper;
- pageData->deltaLen = 0;
+ pageData->deltaLen = sizeof(bool);
/* Compute delta records for lower part of page ... */
computeRegionDelta(pageData, curpage, targetpage,
0, targetLower,
- 0, curLower);
+ 0, curLower,
+ false);
/* ... and for upper part, ignoring what's between */
computeRegionDelta(pageData, curpage, targetpage,
targetUpper, BLCKSZ,
- curUpper, BLCKSZ);
+ curUpper, BLCKSZ,
+ true);
+
+ /*
+ * Set first byte to true if at least one of the region deltas
+ * is diff delta. Otherwise set first byte to false and downgrade all
+ * regions to base format without extra headers.
+ */
+ *containsDiffDelta = pageDataContainsDiffDelta(pageData);
+ if (!(*containsDiffDelta))
+ downgradeDeltaToBaseFormat(pageData);
/*
* If xlog debug is enabled, then check produced delta. Result of delta
@@ -451,12 +999,58 @@ GenericXLogAbort(GenericXLogState *state)
/*
* Apply delta to given page image.
+ *
+ * Read blocks of instructions and apply them based on their type:
+ * BASE_DELTA or DIFF_DELTA.
*/
static void
applyPageRedo(Page page, const char *delta, Size deltaSize)
{
const char *ptr = delta;
const char *end = delta + deltaSize;
+ char header;
+ int length;
+ bool containsDiffDelta;
+
+ /* If page delta is base delta, apply it. */
+ readFromPtr(ptr, containsDiffDelta);
+ if (!containsDiffDelta)
+ {
+ applyPageBaseRedo(page, ptr, end - ptr);
+ return;
+ }
+
+ /* Otherwise apply each region delta. */
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ switch (header)
+ {
+ case DIFF_DELTA:
+ ptr = applyPageDiffRedo(page, ptr, length);
+ break;
+ case BASE_DELTA:
+ ptr = applyPageBaseRedo(page, ptr, length);
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized delta type: %d",
+ header);
+ break;
+ }
+ }
+}
+
+/*
+ * Apply base delta to given page image.
+ */
+static const char *
+applyPageBaseRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
while (ptr < end)
{
@@ -472,6 +1066,80 @@ applyPageRedo(Page page, const char *delta, Size deltaSize)
ptr += length;
}
+ return ptr;
+}
+
+/*
+ * Apply diff delta to given page image.
+ */
+static const char *
+applyPageDiffRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
+ OffsetNumber targetStart,
+ targetEnd;
+ OffsetNumber validStart,
+ validEnd;
+ int i,
+ j;
+ OffsetNumber start;
+
+ /* Read start and end indexes of the buffers */
+ readFromPtr(ptr, validStart);
+ readFromPtr(ptr, validEnd);
+ readFromPtr(ptr, targetStart);
+ readFromPtr(ptr, targetEnd);
+
+ /* Read and apply the instructions */
+ i = 0, j = 0;
+ while (ptr < end)
+ {
+ char type;
+ char len;
+
+ /* Read the header of the current instruction */
+ readFromPtr(ptr, type);
+ readFromPtr(ptr, len);
+ readFromPtr(ptr, start);
+
+ /* Copy the data before current instruction to buffer */
+ memcpy(&localPage[j], page + validStart + i, start - i);
+ j += start - i;
+ i = start;
+
+ /* Apply the instruction */
+ switch (type)
+ {
+ case DIFF_INSERT:
+ memcpy(&localPage[j], ptr, len);
+ ptr += len;
+ j += len;
+ break;
+ case DIFF_REMOVE:
+ i += len;
+ break;
+ case DIFF_REPLACE:
+ memcpy(&localPage[j], ptr, len);
+ i += len;
+ j += len;
+ ptr += len;
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized delta instruction type: %d",
+ type);
+ break;
+ }
+ }
+
+ /* Copy the data after the last instruction */
+ memcpy(&localPage[j], page + validStart + i, validEnd - validStart - i);
+ j += validEnd - validStart - i;
+ i = validEnd - validStart;
+
+ memcpy(page + targetStart, localPage, j);
+ return ptr;
}
/*
On 02/07/2018 09:02 PM, Markus Nullmeier wrote:
One general comment I can already make is that enabling compression
should be made optional, which appears to be a small and easy addition
to the generic WAL API.
The new version of the patch is attached.
In order to control generic WAL compression the new structure
PageXLogCompressParams is introduced. It is passed as an additional
parameter into GenericXLogRegisterBufferEx, so the access method
can control its own WAL compression.
GenericXLogRegisterBuffer uses default compression settings which
appeared to be a reasonable tradeoff between performance overheads
and compression rate on RUM. On my HDD PostgreSQL works even 10%
faster for some RUM workloads because of reducing size of generic
WAL to be written.
Oleg Ivanov
Postgres Professional
The Russian PostgreSQL Company
Attachments:
generic_xlog_diffdelta_v3.patchtext/x-patch; name=generic_xlog_diffdelta_v3.patchDownload
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index ce02354..fa3cd4e 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -43,19 +43,41 @@
* a full page's worth of data.
*-------------------------------------------------------------------------
*/
+
+#define MAX_ALIGN_MISMATCHES 255
+/* MAX_ALIGN_MISMATCHES is not supposed to be greater than PG_UINT8_MAX */
+#if MAX_ALIGN_MISMATCHES > PG_UINT8_MAX
+#error "MAX_ALIGN_MISMATCHES must be not greater than PG_UINT8_MAX"
+#endif
+
#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber))
+#define REGION_HEADER_SIZE (sizeof(char) + sizeof(int))
+#define DIFF_DELTA_HEADER_SIZE (sizeof(char) + 2 * sizeof(OffsetNumber))
+#define MISMATCH_HEADER_SIZE (sizeof(char) + sizeof(uint8) + \
+ sizeof(OffsetNumber))
#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE
-#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE)
+#define MAX_DELTA_SIZE (BLCKSZ + \
+ 2 * REGION_HEADER_SIZE + \
+ 2 * FRAGMENT_HEADER_SIZE + \
+ 2 * DIFF_DELTA_HEADER_SIZE + \
+ MAX_ALIGN_MISMATCHES * MISMATCH_HEADER_SIZE \
+ + sizeof(bool))
+
+#define writeToPtr(ptr, x) memcpy(ptr, &(x), sizeof(x)), ptr += sizeof(x)
+#define readFromPtr(ptr, x) memcpy(&(x), ptr, sizeof(x)), ptr += sizeof(x)
/* Struct of generic xlog data for single page */
typedef struct
{
Buffer buffer; /* registered buffer */
int flags; /* flags for this buffer */
+
int deltaLen; /* space consumed in delta field */
char *image; /* copy of page image for modification, do not
* do it in-place to have aligned memory chunk */
char delta[MAX_DELTA_SIZE]; /* delta between page images */
+
+ PageXLogCompressParams compressParams; /* compress parameters */
} PageData;
/* State of generic xlog record construction */
@@ -71,15 +93,77 @@ struct GenericXLogState
bool isLogged;
};
+/* Describes for the region which type of delta is used in it */
+typedef enum
+{
+ DIFF_DELTA = 0, /* diff delta with insert, remove and replace
+ * operations */
+ BASE_DELTA = 1 /* base delta with update operations only */
+} DeltaType;
+
+/* Diff delta operations for transforming current page to target page */
+typedef enum
+{
+ DIFF_INSERT = 0,
+ DIFF_REMOVE = 1,
+ DIFF_REPLACE = 2
+} DiffDeltaOperations;
+
+/* Describes the kind of region of the page */
+typedef enum
+{
+ UPPER_REGION = 0,
+ LOWER_REGION = 1
+} RegionType;
+
static void writeFragment(PageData *pageData, OffsetNumber offset,
OffsetNumber len, const char *data);
static void computeRegionDelta(PageData *pageData,
const char *curpage, const char *targetpage,
int targetStart, int targetEnd,
- int validStart, int validEnd);
+ int validStart, int validEnd,
+ uint8 maxMismatches);
static void computeDelta(PageData *pageData, Page curpage, Page targetpage);
static void applyPageRedo(Page page, const char *delta, Size deltaSize);
+static int alignRegions(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen, uint8 maxMismatches);
+static int restoreCompactAlignment(const char *curRegion,
+ const char *targetRegion,
+ int curRegionLen,
+ int targetRegionLen,
+ int numMismatches);
+
+static bool computeRegionDiffDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd,
+ uint8 maxMismatches);
+static const char *applyPageDiffRedo(Page page, const char *delta, Size deltaSize);
+
+static void computeRegionBaseDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd);
+static const char *applyPageBaseRedo(Page page, const char *delta, Size deltaSize);
+
+static bool pageDataContainsDiffDelta(PageData *pageData);
+static void downgradeDeltaToBaseFormat(PageData *pageData);
+
+/* Arrays for the alignment building and for the resulting alignments */
+static int V[MAX_ALIGN_MISMATCHES + 1][2 * MAX_ALIGN_MISMATCHES + 1];
+static int prevDiag[MAX_ALIGN_MISMATCHES + 1][2 * MAX_ALIGN_MISMATCHES + 1];
+static int alignmentDiag[MAX_ALIGN_MISMATCHES + 1];
+static char curRegionAligned[MAX_ALIGN_MISMATCHES];
+static bool curRegionAlignedGap[MAX_ALIGN_MISMATCHES];
+static char targetRegionAligned[MAX_ALIGN_MISMATCHES];
+static bool targetRegionAlignedGap[MAX_ALIGN_MISMATCHES];
+static int curRegionAlignedPos[MAX_ALIGN_MISMATCHES];
+static int targetRegionAlignedPos[MAX_ALIGN_MISMATCHES];
+
+/* Array for diff delta application */
+static char localPage[BLCKSZ];
+
/*
* Write next fragment into pageData's delta.
@@ -115,14 +199,82 @@ writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
* Bytes in curpage outside the range validStart to validEnd-1 should be
* considered invalid, and always overwritten with target data.
*
- * This function is a hot spot, so it's worth being as tense as possible
- * about the data-matching loops.
+ * If forceBaseDelta is true, fucntion just calls computeRegionBaseDelta.
+ * Otherwise this function tries to build diff delta first and, if it fails,
+ * uses the base delta. It also provides the header before the delta in which
+ * the type and the length of the delta are contained.
*/
static void
computeRegionDelta(PageData *pageData,
const char *curpage, const char *targetpage,
int targetStart, int targetEnd,
- int validStart, int validEnd)
+ int validStart, int validEnd,
+ uint8 maxMismatches)
+{
+ int length;
+ char header;
+ int prevDeltaLen;
+ bool diff = false;
+ char *ptr = pageData->delta + pageData->deltaLen;
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(header) +
+ sizeof(length) <= MAX_DELTA_SIZE);
+
+ pageData->deltaLen += sizeof(header) + sizeof(length);
+ prevDeltaLen = pageData->deltaLen;
+
+ /* Not sure what to do with too big maxMismathes. Now we just clip it. */
+ if (maxMismatches > MAX_ALIGN_MISMATCHES)
+ maxMismatches = MAX_ALIGN_MISMATCHES;
+
+ /* Try building diff delta only if necessary */
+ if (maxMismatches > 0)
+ {
+ diff = computeRegionDiffDelta(pageData,
+ curpage, targetpage,
+ targetStart, targetEnd,
+ validStart, validEnd,
+ maxMismatches);
+ }
+
+ /*
+ * If we succeeded to make diff delta, just set the header. Otherwise,
+ * make base delta.
+ */
+ if (diff)
+ {
+ header = DIFF_DELTA;
+ }
+ else
+ {
+ header = BASE_DELTA;
+ computeRegionBaseDelta(pageData,
+ curpage, targetpage,
+ targetStart, targetEnd,
+ validStart, validEnd);
+ }
+ length = pageData->deltaLen - prevDeltaLen;
+
+ writeToPtr(ptr, header);
+ writeToPtr(ptr, length);
+}
+
+/*
+ * Compute the XLOG fragments needed to transform a region of curpage into the
+ * corresponding region of targetpage, and append them to pageData's delta
+ * field. The region to transform runs from targetStart to targetEnd-1.
+ * Bytes in curpage outside the range validStart to validEnd-1 should be
+ * considered invalid, and always overwritten with target data.
+ *
+ * This function is a hot spot, so it's worth being as tense as possible
+ * about the data-matching loops.
+ */
+static void
+computeRegionBaseDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd)
{
int i,
loopEnd,
@@ -222,27 +374,508 @@ computeRegionDelta(PageData *pageData,
}
/*
+ * Align curRegion and targetRegion and return the number of mismatches
+ * or -1 if the alignment with number of mismatching positions less than
+ * or equal to maxMismatches does not exist.
+ * The algorithm guarantees to find the alignment with the least possible
+ * number of mismathing positions or return that such least number is greater
+ * than maxMismatches.
+ *
+ * For a good introduction to the subject, read about the "Levenshtein
+ * distance" in Wikipedia.
+ *
+ * The basic algorithm is described in:
+ * "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
+ * Algorithmica Vol. 1, 1986, pp. 251-266,
+ * <http://dx.doi.org/10.1007/BF01840446>,
+ * PDF: <http://mail.xmailserver.net/diff2.pdf>.
+ * See especially section 3, which describes the variation used below.
+ *
+ * This variation requires O(N + D ^ 2) memory and has time complexity O(ND).
+ * We choose it because it is faster than described in section 4.2 modification
+ * with O(N + D) memory requirement.
+ *
+ * The only modification we made to the original algorithm is the introduction
+ * of REPLACE operations, while in the original algorithm only INSERT and
+ * REMOVE are considered. This introduction doesn't affect time and memory
+ * complexity of the algorithm.
+ */
+static int
+alignRegions(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen,
+ uint8 maxMismatches)
+{
+ /* Number of mismatches */
+ int m;
+
+ /* Difference between curRegion and targetRegion prefix lengths */
+ int k;
+
+ /* Curbuf and targetRegion prefix lengths */
+ int i,
+ j;
+
+ /* Number of mismathes in the answer */
+ int numMismatches = -1;
+
+ /*
+ * If lengths differ too much, there is no alignment with a small number
+ * of mismatches.
+ */
+ if (!(-maxMismatches < curRegionLen - targetRegionLen &&
+ curRegionLen - targetRegionLen < maxMismatches))
+ return -1;
+
+ /*
+ * V is an array to store the values of dynamic programming. The first
+ * dimension corresponds to m, i. e. to the number of performed editing
+ * operations, and the second one is for k + m, where k is the number of a
+ * diagonal. A diagonal numbered k is such points (i, j) where i - j = k.
+ * Here i means the length of curRegion prefix and j means the length of
+ * targetRegion prefix. V[m][m + k] is the length of the longest prefix of
+ * curRegion i which can be aligned with the prefix of length j of
+ * targetRegion using m editing operations. In the loop below we
+ * initialize V[0][0] and then compute V[m][m + k] based on, if defined,
+ * V[m - 1][m + k - 1], V[m - 1][m + k], and V[m - 1][m + k + 1]. V[m][m +
+ * k] is undefined if k < -m or k > m.
+ */
+ V[0][0] = 0;
+
+ /* Find the longest path with the given number of mismatches */
+ for (m = 0; m <= maxMismatches; ++m)
+ {
+ /*
+ * Find the largest prefix alignment with the given number of
+ * mismatches and the given diagonal, i. e. difference between
+ * curRegion and targetRegion prefix lengths.
+ */
+ for (k = -m; k <= m; ++k)
+ {
+ /* Dynamic programming recurrent step */
+ if (m > 0)
+ {
+ i = -1;
+ if (k != -m && k != m &&
+ V[m - 1][m - 1 + k] + 1 > i)
+ {
+ i = V[m - 1][m - 1 + k] + 1;
+ prevDiag[m][m + k] = k;
+ }
+ if (k != -m && k != -m + 1 &&
+ V[m - 1][m - 1 + k - 1] + 1 > i)
+ {
+ i = V[m - 1][m - 1 + k - 1] + 1;
+ prevDiag[m][m + k] = k - 1;
+ }
+ if (k != m && k != m - 1 &&
+ V[m - 1][m - 1 + k + 1] > i)
+ {
+ i = V[m - 1][m - 1 + k + 1];
+ prevDiag[m][m + k] = k + 1;
+ }
+ }
+ else
+ i = 0;
+ j = i - k;
+
+ /* Boundary checks */
+ Assert(i >= 0);
+ Assert(j >= 0);
+
+ /* Increase the prefixes while the bytes are equal */
+ while (i < curRegionLen && j < targetRegionLen &&
+ curRegion[i] == targetRegion[j])
+ i++, j++;
+
+ /*
+ * Save the largest curRegion prefix that was aligned with given
+ * number of mismatches and difference between curRegion and
+ * targetRegion prefix lengths.
+ */
+ V[m][m + k] = i;
+
+ /* If we find the alignment, save its length and break */
+ if (i == curRegionLen && j == targetRegionLen)
+ {
+ numMismatches = m;
+ break;
+ }
+ }
+ /* Break if we find an alignment */
+ if (numMismatches != -1)
+ break;
+ }
+ /* No alignment was found */
+ if (numMismatches == -1)
+ return -1;
+
+ /*
+ * Restore the path k-s for each iteration of the main loop for the found
+ * alignment.
+ */
+ Assert(m >= 0);
+ while (m != 0)
+ {
+ Assert(-m <= k && k <= m);
+ alignmentDiag[m] = k;
+ k = prevDiag[m][m + k];
+ m--;
+ }
+ Assert(k == 0);
+ alignmentDiag[0] = 0;
+
+ return numMismatches;
+}
+
+/*
+ * Restore the mismathcing parts of the alignment based on V, alignmentDiag,
+ * and numMismacthes. Do some alignment assertions also.
+ *
+ * The compressed alignment is stored in curRegionAligned, targetRegionAligned,
+ * curRegionAlignedGap, targetRegionAlignedGap, curRegionAlignedPos, and
+ * targetRegionAlignedPos. The first two arrays contain the aligned data,
+ * the second two contain the map of align gaps in the first two arrays,
+ * and the last two arrays contain the positions of the aligned parts in
+ * the original arrays.
+ */
+int
+restoreCompactAlignment(const char *curRegion, const char *targetRegion,
+ int curRegionLen, int targetRegionLen,
+ int numMismatches)
+{
+ int i,
+ j,
+ k,
+ m;
+
+ /* The length of the equal block */
+ int curLen = 0;
+
+ /* Result alignment length */
+ int resLen = 0;
+
+ /* Maximal possible result alignment length */
+ int maxResLen = Min(curRegionLen, targetRegionLen) + numMismatches;
+
+ /* Keep compiler quiet */
+ if (curLen != 0)
+ curLen = maxResLen;
+
+ /* Check whether the first equal block is computed correctly */
+ curLen = V[0][0];
+ Assert(curLen >= 0);
+ Assert(resLen + curLen <= maxResLen);
+ Assert(memcmp(curRegion, targetRegion, curLen) == 0);
+
+ /* Restore the alignment */
+ for (m = 1; m <= numMismatches; ++m)
+ {
+ /* Initialize the variables for the block */
+ int dk = alignmentDiag[m] - alignmentDiag[m - 1];
+ int prevDiag = alignmentDiag[m - 1];
+
+ k = alignmentDiag[m];
+ i = V[m - 1][m - 1 + prevDiag];
+ j = i - prevDiag;
+ /* Check state consistency */
+ Assert(0 <= i && i <= curRegionLen);
+ Assert(0 <= j && j <= targetRegionLen);
+
+ /* Check the block operation correctness */
+ Assert(dk == -1 || i < curRegionLen);
+ Assert(dk == 1 || j < targetRegionLen);
+ Assert(-1 <= dk && dk <= 1);
+ Assert(resLen + 1 <= maxResLen);
+
+ /* Do the alignment operation of the block */
+ curRegionAlignedPos[resLen] = i;
+ targetRegionAlignedPos[resLen] = j;
+ if (dk == 1 || dk == 0)
+ {
+ curRegionAlignedGap[resLen] = false;
+ curRegionAligned[resLen] = curRegion[i++];
+ }
+ else
+ curRegionAlignedGap[resLen] = true;
+ if (dk == 0 || dk == -1)
+ {
+ targetRegionAlignedGap[resLen] = false;
+ targetRegionAligned[resLen] = targetRegion[j++];
+ }
+ else
+ targetRegionAlignedGap[resLen] = true;
+ resLen++;
+
+ /* Compute the size of the equal part of the block */
+ curLen = V[m][m + k] - i;
+
+ /* Check whether the equal part of the block is computed correctly */
+ Assert(curLen >= 0);
+ Assert(resLen + curLen <= maxResLen);
+ Assert(memcmp(&curRegion[i], &targetRegion[j], curLen) == 0);
+ }
+
+ return resLen;
+}
+
+/*
+ * Try to build a short alignment in order to produce a short diff delta.
+ * If fails, return false, otherwise return true and write the delta to
+ * pageData->delta.
+ */
+static bool
+computeRegionDiffDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd,
+ uint8 maxMismatches)
+{
+ char *ptr = pageData->delta + pageData->deltaLen;
+ int i,
+ j;
+ char type;
+ uint8 len;
+ OffsetNumber start;
+ OffsetNumber tmp;
+
+ int numMismatches;
+ int alignmentLength;
+
+ int curRegionLen = validEnd - validStart;
+ int targetRegionLen = targetEnd - targetStart;
+
+ numMismatches = alignRegions(&curpage[validStart],
+ &targetpage[targetStart],
+ curRegionLen,
+ targetRegionLen,
+ maxMismatches);
+ Assert(numMismatches <= maxMismatches);
+
+ /* If no proper alignment was found return false */
+ if (numMismatches < 0)
+ return false;
+
+ /* Restore the alignment in a compact form */
+ alignmentLength = restoreCompactAlignment(&curpage[validStart],
+ &targetpage[targetStart],
+ curRegionLen,
+ targetRegionLen,
+ numMismatches);
+
+ /*
+ * Translate the alignment into the set of instructions for transformation
+ * from curRegion into targetRegion, and write these instructions into
+ * pageData->delta.
+ */
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(type) + 2 * sizeof(tmp) <= MAX_DELTA_SIZE);
+
+ /* Check whether the region is the upper or the lower part of the page */
+ Assert((validStart == 0 && targetStart == 0) ||
+ (validEnd == BLCKSZ && targetEnd == BLCKSZ));
+
+ /* Write start and end indexes of the buffers */
+ if (validStart == 0 && targetStart == 0)
+ {
+ /* We are in the upper part, there is no need to store Starts */
+ type = UPPER_REGION;
+ writeToPtr(ptr, type);
+ tmp = validEnd;
+ writeToPtr(ptr, tmp);
+ tmp = targetEnd;
+ writeToPtr(ptr, tmp);
+ }
+ else
+ {
+ /* We are in the lower part, there is no need to store Ends */
+ type = LOWER_REGION;
+ writeToPtr(ptr, type);
+ tmp = validStart;
+ writeToPtr(ptr, tmp);
+ tmp = targetStart;
+ writeToPtr(ptr, tmp);
+ }
+
+ /* Transform the alignment into the set of instructions */
+ for (i = 0; i < alignmentLength; ++i)
+ {
+ /* Verify the alignment */
+ Assert(!curRegionAlignedGap[i] || !targetRegionAlignedGap[i]);
+ Assert(curRegionAligned[i] != targetRegionAligned[i] ||
+ curRegionAlignedGap[i] || targetRegionAlignedGap[i]);
+
+ /* Determine the type of the instruction */
+ if (curRegionAlignedGap[i])
+ type = DIFF_INSERT;
+ else if (targetRegionAlignedGap[i])
+ type = DIFF_REMOVE;
+ else
+ type = DIFF_REPLACE;
+
+ /* Find the end of the block of the same instructions */
+ j = i + 1;
+ while (j < alignmentLength)
+ {
+ bool sameBlock;
+
+ sameBlock = (
+ (curRegionAlignedPos[j] <=
+ curRegionAlignedPos[j - 1] + 1) &&
+ (targetRegionAlignedPos[j] <=
+ targetRegionAlignedPos[j - 1] + 1)
+ );
+
+ switch (type)
+ {
+ case DIFF_INSERT:
+ sameBlock &= (curRegionAlignedGap[j]);
+ break;
+ case DIFF_REMOVE:
+ sameBlock &= (targetRegionAlignedGap[j]);
+ break;
+ case DIFF_REPLACE:
+ sameBlock &= (!curRegionAlignedGap[j] &&
+ !targetRegionAlignedGap[j] &&
+ (curRegionAligned[j] !=
+ targetRegionAligned[j]));
+ break;
+ default:
+ elog(ERROR, "unrecognized delta operation type: %d", type);
+ break;
+ }
+ if (sameBlock)
+ j++;
+ else
+ break;
+ }
+ len = j - i;
+
+ start = curRegionAlignedPos[i];
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(type) +
+ sizeof(len) + sizeof(start) <= MAX_DELTA_SIZE);
+ /* Write the header for instruction */
+ writeToPtr(ptr, type);
+ writeToPtr(ptr, len);
+ writeToPtr(ptr, start);
+
+ /* Write instruction data and go to the end of the block */
+ if (type != DIFF_REMOVE)
+ {
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + len <= MAX_DELTA_SIZE);
+ while (i < j)
+ {
+ char c = targetRegionAligned[i++];
+
+ writeToPtr(ptr, c);
+ }
+ }
+ else
+ i = j;
+ i--;
+ }
+
+ pageData->deltaLen = ptr - pageData->delta;
+
+ return true;
+}
+
+/*
+ * Return whether pageData->delta contains diff deltas or not.
+ */
+static bool
+pageDataContainsDiffDelta(PageData *pageData)
+{
+ char *ptr = pageData->delta + sizeof(bool);
+ char *end = pageData->delta + pageData->deltaLen;
+ char header;
+ int length;
+
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ if (header == DIFF_DELTA)
+ return true;
+ ptr += length;
+ }
+ return false;
+}
+
+/*
+ * Downgrade pageData->delta to base delta format.
+ *
+ * Only base diffs are allowed to perform the transformation.
+ */
+static void
+downgradeDeltaToBaseFormat(PageData *pageData)
+{
+ char *ptr = pageData->delta;
+ char *end = pageData->delta + pageData->deltaLen;
+ char *cur;
+ bool containsDiffDelta;
+ char header;
+ int length;
+ int newDeltaLength = 0;
+
+ /* Check whether containsDiffDelta is false */
+ readFromPtr(ptr, containsDiffDelta);
+ Assert(!containsDiffDelta);
+
+ cur = ptr;
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ /* Check whether the region delta is base delta */
+ Assert(header == BASE_DELTA);
+ newDeltaLength += length;
+
+ memmove(cur, ptr, length);
+ cur += length;
+ ptr += length;
+ }
+ pageData->deltaLen = newDeltaLength;
+}
+
+/*
* Compute the XLOG delta record needed to transform curpage into targetpage,
* and store it in pageData's delta field.
*/
static void
computeDelta(PageData *pageData, Page curpage, Page targetpage)
{
+ bool *containsDiffDelta = pageData->delta;
int targetLower = ((PageHeader) targetpage)->pd_lower,
targetUpper = ((PageHeader) targetpage)->pd_upper,
curLower = ((PageHeader) curpage)->pd_lower,
curUpper = ((PageHeader) curpage)->pd_upper;
- pageData->deltaLen = 0;
+ pageData->deltaLen = sizeof(bool);
/* Compute delta records for lower part of page ... */
computeRegionDelta(pageData, curpage, targetpage,
0, targetLower,
- 0, curLower);
+ 0, curLower,
+ pageData->compressParams.lowerMaxMismatches);
/* ... and for upper part, ignoring what's between */
computeRegionDelta(pageData, curpage, targetpage,
targetUpper, BLCKSZ,
- curUpper, BLCKSZ);
+ curUpper, BLCKSZ,
+ pageData->compressParams.upperMaxMismatches);
+
+ /*
+ * Set first byte to true if at least one of the region deltas is diff
+ * delta. Otherwise set first byte to false and downgrade all regions to
+ * base format without extra headers.
+ */
+ *containsDiffDelta = pageDataContainsDiffDelta(pageData);
+ if (!(*containsDiffDelta))
+ downgradeDeltaToBaseFormat(pageData);
/*
* If xlog debug is enabled, then check produced delta. Result of delta
@@ -291,11 +924,13 @@ GenericXLogStart(Relation relation)
* is what the caller should modify.
*
* If the buffer is already registered, just return its existing entry.
- * (It's not very clear what to do with the flags in such a case, but
- * for now we stay with the original flags.)
+ * (It's not very clear what to do with the flags and parameters in such
+ * a case, but for now we stay with the original flags and parameters.)
*/
Page
-GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
+GenericXLogRegisterBufferEx(GenericXLogState *state,
+ Buffer buffer, int flags,
+ PageXLogCompressParams compressParams)
{
int block_id;
@@ -309,6 +944,7 @@ GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
/* Empty slot, so use it (there cannot be a match later) */
page->buffer = buffer;
page->flags = flags;
+ page->compressParams = compressParams;
memcpy(page->image, BufferGetPage(buffer), BLCKSZ);
return (Page) page->image;
}
@@ -329,6 +965,22 @@ GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
}
/*
+ * An alias for GenericXLogRegisterBufferEx with default parameters
+ * which are optimal for Bloom and RUM indexes.
+ * Left for backward compatibility.
+ */
+Page
+GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
+{
+ PageXLogCompressParams defaultParameters;
+
+ defaultParameters.upperMaxMismatches = 0;
+ defaultParameters.lowerMaxMismatches = 24;
+ return GenericXLogRegisterBufferEx(state, buffer, flags,
+ defaultParameters);
+}
+
+/*
* Apply changes represented by GenericXLogState to the actual buffers,
* and emit a generic xlog record.
*/
@@ -451,12 +1103,58 @@ GenericXLogAbort(GenericXLogState *state)
/*
* Apply delta to given page image.
+ *
+ * Read blocks of instructions and apply them based on their type:
+ * BASE_DELTA or DIFF_DELTA.
*/
static void
applyPageRedo(Page page, const char *delta, Size deltaSize)
{
const char *ptr = delta;
const char *end = delta + deltaSize;
+ char header;
+ int length;
+ bool containsDiffDelta;
+
+ /* If page delta is base delta, apply it. */
+ readFromPtr(ptr, containsDiffDelta);
+ if (!containsDiffDelta)
+ {
+ applyPageBaseRedo(page, ptr, end - ptr);
+ return;
+ }
+
+ /* Otherwise apply each region delta. */
+ while (ptr < end)
+ {
+ readFromPtr(ptr, header);
+ readFromPtr(ptr, length);
+
+ switch (header)
+ {
+ case DIFF_DELTA:
+ ptr = applyPageDiffRedo(page, ptr, length);
+ break;
+ case BASE_DELTA:
+ ptr = applyPageBaseRedo(page, ptr, length);
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized delta type: %d",
+ header);
+ break;
+ }
+ }
+}
+
+/*
+ * Apply base delta to given page image.
+ */
+static const char *
+applyPageBaseRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
while (ptr < end)
{
@@ -472,6 +1170,96 @@ applyPageRedo(Page page, const char *delta, Size deltaSize)
ptr += length;
}
+ return ptr;
+}
+
+/*
+ * Apply diff delta to given page image.
+ */
+static const char *
+applyPageDiffRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
+ char type;
+ uint8 len;
+ OffsetNumber targetStart,
+ targetEnd;
+ OffsetNumber validStart,
+ validEnd;
+ int i,
+ j;
+ OffsetNumber start;
+
+ /* Read start and end indexes of the buffers */
+ validStart = 0;
+ validEnd = BLCKSZ;
+ targetStart = 0;
+ targetEnd = BLCKSZ;
+ readFromPtr(ptr, type);
+ switch (type)
+ {
+ case UPPER_REGION:
+ readFromPtr(ptr, validEnd);
+ readFromPtr(ptr, targetEnd);
+ break;
+ case LOWER_REGION:
+ readFromPtr(ptr, validStart);
+ readFromPtr(ptr, targetStart);
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized region type: %d",
+ type);
+ break;
+ }
+
+ /* Read and apply the instructions */
+ i = 0, j = 0;
+ while (ptr < end)
+ {
+ /* Read the header of the current instruction */
+ readFromPtr(ptr, type);
+ readFromPtr(ptr, len);
+ readFromPtr(ptr, start);
+
+ /* Copy the data before current instruction to buffer */
+ memcpy(&localPage[j], page + validStart + i, start - i);
+ j += start - i;
+ i = start;
+
+ /* Apply the instruction */
+ switch (type)
+ {
+ case DIFF_INSERT:
+ memcpy(&localPage[j], ptr, len);
+ ptr += len;
+ j += len;
+ break;
+ case DIFF_REMOVE:
+ i += len;
+ break;
+ case DIFF_REPLACE:
+ memcpy(&localPage[j], ptr, len);
+ i += len;
+ j += len;
+ ptr += len;
+ break;
+ default:
+ elog(ERROR,
+ "unrecognized delta instruction type: %d",
+ type);
+ break;
+ }
+ }
+
+ /* Copy the data after the last instruction */
+ memcpy(&localPage[j], page + validStart + i, validEnd - validStart - i);
+ j += validEnd - validStart - i;
+ i = validEnd - validStart;
+
+ memcpy(page + targetStart, localPage, j);
+ return ptr;
}
/*
diff --git a/src/include/access/generic_xlog.h b/src/include/access/generic_xlog.h
index b23e1f6..a8e406b 100644
--- a/src/include/access/generic_xlog.h
+++ b/src/include/access/generic_xlog.h
@@ -25,6 +25,17 @@
/* Flag bits for GenericXLogRegisterBuffer */
#define GENERIC_XLOG_FULL_IMAGE 0x0001 /* write full-page image */
+/* Struct of generic xlog compression parameters for a single page */
+typedef struct
+{
+ /*
+ * maximal number of mismatches for diff delta for the upper part of the
+ * page. Zero value disables diff delta for the part
+ */
+ uint8 upperMaxMismatches;
+ uint8 lowerMaxMismatches; /* the same for the lower part */
+} PageXLogCompressParams;
+
/* state of generic xlog record construction */
struct GenericXLogState;
typedef struct GenericXLogState GenericXLogState;
@@ -33,6 +44,8 @@ typedef struct GenericXLogState GenericXLogState;
extern GenericXLogState *GenericXLogStart(Relation relation);
extern Page GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer,
int flags);
+extern Page GenericXLogRegisterBufferEx(GenericXLogState *state, Buffer buffer,
+ int flags, PageXLogCompressParams compressParams);
extern XLogRecPtr GenericXLogFinish(GenericXLogState *state);
extern void GenericXLogAbort(GenericXLogState *state);