From 78cba2fcfbe11451ec6b8cd6e4c48b315571ab0d Mon Sep 17 00:00:00 2001
From: Maxim Orlov
Date: Tue, 13 Aug 2024 14:44:50 +0300
Subject: [PATCH v3 3/3] Make pg_upgrade convert multixact offsets.

Author: Maxim Orlov
---
 src/bin/pg_upgrade/Makefile     |   1 +
 src/bin/pg_upgrade/meson.build  |   1 +
 src/bin/pg_upgrade/pg_upgrade.c |  29 ++-
 src/bin/pg_upgrade/pg_upgrade.h |  13 +-
 src/bin/pg_upgrade/segresize.c  | 383 ++++++++++++++++++++++++++++++++
 5 files changed, 423 insertions(+), 4 deletions(-)
 create mode 100644 src/bin/pg_upgrade/segresize.c

diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index bde91e2beb..030816596f 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -21,6 +21,7 @@ OBJS = \
 	info.o \
 	option.o \
 	parallel.o \
+	segresize.o \
 	pg_upgrade.o \
 	relfilenumber.o \
 	server.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index 9825fa3305..2d9f7e6b65 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -10,6 +10,7 @@ pg_upgrade_sources = files(
   'info.c',
   'option.c',
   'parallel.c',
+  'segresize.c',
   'pg_upgrade.c',
   'relfilenumber.c',
   'server.c',
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 663235816f..d9d8d0ea78 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -750,7 +750,30 @@ copy_xact_xlog_xid(void)
 	if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
 		new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
 	{
-		copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+		/*
+		 * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER
+		 * it must have 32-bit multixid offsets, thus it should be converted.
+		 */
+		if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
+			new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
+		{
+			uint64		oldest_offset = convert_multixact_offsets();
+
+			if (oldest_offset)
+			{
+				uint64		next_offset = old_cluster.controldata.chkpnt_nxtmxoff;
+
+				/* Handle possible wraparound. */
+				if (next_offset < oldest_offset)
+					next_offset += ((uint64) 1 << 32) - 1;
+
+				next_offset -= oldest_offset - 1;
+				old_cluster.controldata.chkpnt_nxtmxoff = next_offset;
+			}
+		}
+		else
+			copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+
 		copy_subdir_files("pg_multixact/members", "pg_multixact/members");
 
 		prep_status("Setting next multixact ID and offset for new cluster");
@@ -760,9 +783,9 @@ copy_xact_xlog_xid(void)
 		 * counters here and the oldest multi present on system.
 		 */
 		exec_prog(UTILITY_LOG_FILE, NULL, true, true,
-				  "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
+				  "\"%s/pg_resetwal\" -O %llu -m %u,%u \"%s\"",
 				  new_cluster.bindir,
-				  old_cluster.controldata.chkpnt_nxtmxoff,
+				  (unsigned long long) old_cluster.controldata.chkpnt_nxtmxoff,
 				  old_cluster.controldata.chkpnt_nxtmulti,
 				  old_cluster.controldata.chkpnt_oldstMulti,
 				  new_cluster.pgdata);
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index cdb6e2b759..157e59e38f 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,13 @@ extern char *output_files[];
  */
 #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
 
+/*
+ * Switch from 32-bit to 64-bit for multixid offsets.
+ *
+ * XXX: should be changed to the actual CATALOG_VERSION_NO on commit.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 202409041
+
 /*
  * large object chunk size added to pg_controldata,
  * commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -230,7 +237,7 @@ typedef struct
 	uint32		chkpnt_nxtepoch;
 	uint32		chkpnt_nxtoid;
 	uint32		chkpnt_nxtmulti;
-	uint32		chkpnt_nxtmxoff;
+	uint64		chkpnt_nxtmxoff;
 	uint32		chkpnt_oldstMulti;
 	uint32		chkpnt_oldstxid;
 	uint32		align;
@@ -494,3 +501,7 @@ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr
 										  char *old_pgdata, char *new_pgdata,
 										  char *old_tablespace);
 bool		reap_child(bool wait_for_child);
+
+/* segresize.c */
+
+uint64		convert_multixact_offsets(void);
diff --git a/src/bin/pg_upgrade/segresize.c b/src/bin/pg_upgrade/segresize.c
new file mode 100644
index 0000000000..e47c0a2407
--- /dev/null
+++ b/src/bin/pg_upgrade/segresize.c
@@ -0,0 +1,383 @@
+/*
+ *	segresize.c
+ *
+ *	SLRU segment resize utility
+ *
+ *	Copyright (c) 2024, PostgreSQL Global Development Group
+ *	src/bin/pg_upgrade/segresize.c
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_upgrade.h"
+#include "access/multixact.h"
+
+/* See slru.h */
+#define SLRU_PAGES_PER_SEGMENT		32
+
+/*
+ * Iterator over the pages of an SLRU directory.  The caller sets the starting
+ * segment and page number and then moves through the pages one at a time.
+ */
+typedef struct SlruSegState
+{
+	char	   *dir;			/* path of the SLRU directory */
+	char	   *fn;				/* path of the open segment file, or NULL */
+	FILE	   *file;			/* open segment file, or NULL */
+	int64		segno;			/* current segment number */
+	uint64		pageno;			/* current page within the segment */
+	bool		leading_gap;	/* no segment file has been read yet */
+	bool		long_segment_names; /* 15 hex digit names instead of 4 */
+} SlruSegState;
+
+/*
+ * Get SLRU segment file name from state.
+ *
+ * The returned string is allocated by psprintf(); close_segment() frees it.
+ *
+ * NOTE: this function should mirror SlruFileName call.
+ */
+static inline char *
+SlruFileName(SlruSegState *state)
+{
+	if (state->long_segment_names)
+	{
+		Assert(state->segno >= 0 &&
+			   state->segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+		/* %llX takes an unsigned argument. */
+		return psprintf("%s/%015llX", state->dir,
+						(unsigned long long) state->segno);
+	}
+	else
+	{
+		Assert(state->segno >= 0 &&
+			   state->segno <= INT64CONST(0xFFFFFF));
+		return psprintf("%s/%04X", state->dir, (unsigned int) state->segno);
+	}
+}
+
+/*
+ * Create SLRU segment file.
+ */
+static void
+create_segment(SlruSegState *state)
+{
+	Assert(state->fn == NULL);
+	Assert(state->file == NULL);
+
+	state->fn = SlruFileName(state);
+	state->file = fopen(state->fn, "wb");
+	if (!state->file)
+		pg_fatal("could not create file \"%s\": %m", state->fn);
+}
+
+/*
+ * Open existing SLRU segment file.
+ */
+static void
+open_segment(SlruSegState *state)
+{
+	Assert(state->fn == NULL);
+	Assert(state->file == NULL);
+
+	state->fn = SlruFileName(state);
+	state->file = fopen(state->fn, "rb");
+	if (!state->file)
+		pg_fatal("could not open file \"%s\": %m", state->fn);
+}
+
+/*
+ * Close SLRU segment file.
+ *
+ * fclose() flushes buffered output, so a failure here may mean that a newly
+ * written segment is incomplete.  Report it instead of silently ignoring it.
+ */
+static void
+close_segment(SlruSegState *state)
+{
+	if (state->file)
+	{
+		if (fclose(state->file) != 0)
+			pg_fatal("could not close file \"%s\": %m", state->fn);
+		state->file = NULL;
+	}
+
+	if (state->fn)
+	{
+		pfree(state->fn);
+		state->fn = NULL;
+	}
+}
+
+/*
+ * Read next page from the old 32-bit offset segment file.
+ *
+ * Returns the number of bytes stored in buf, or -1 if no page could be read.
+ * *empty is set to true when buf received no data from the file.  The state
+ * is always advanced to the following page, and the segment may already be
+ * closed (state->fn == NULL) when this function returns.
+ *
+ * NOTE(review): open_segment() and the fseek() check below exit on failure,
+ * so the two branches for state->file == NULL look unreachable at present.
+ */
+static int
+read_old_segment_page(SlruSegState *state, void *buf, bool *empty)
+{
+	int			len;
+
+	/* Open next segment file, if needed. */
+	if (!state->fn)
+	{
+		if (!state->segno)
+			state->leading_gap = true;
+
+		open_segment(state);
+
+		/* Set position to the needed page. */
+		if (state->pageno > 0 &&
+			fseek(state->file, state->pageno * BLCKSZ, SEEK_SET))
+			pg_fatal("could not seek in file \"%s\": %m", state->fn);
+	}
+
+	if (state->file)
+	{
+		/* Segment file does exist, read page from it. */
+		state->leading_gap = false;
+
+		len = fread(buf, sizeof(char), BLCKSZ, state->file);
+
+		/* Are we done or was there an error? */
+		if (len <= 0)
+		{
+			if (ferror(state->file))
+				pg_fatal("error reading file \"%s\": %m", state->fn);
+
+			/* Not an error, so we hit end of file. */
+			*empty = true;
+			len = -1;
+
+			close_segment(state);
+		}
+		else
+			*empty = false;
+	}
+	else if (!state->leading_gap)
+	{
+		/* We reached the last segment. */
+		len = -1;
+		*empty = true;
+	}
+	else
+	{
+		/* Skip few first segments if they were frozen and removed. */
+		len = BLCKSZ;
+		*empty = true;
+	}
+
+	if (++state->pageno >= SLRU_PAGES_PER_SEGMENT)
+	{
+		/* Start a new segment. */
+		state->segno++;
+		state->pageno = 0;
+
+		close_segment(state);
+	}
+
+	return len;
+}
+
+/*
+ * Write next page to the new 64-bit offset segment file.
+ */
+static void
+write_new_segment_page(SlruSegState *state, void *buf)
+{
+	/*
+	 * Create the segment file lazily, when the first page is written to it,
+	 * so that no file is created for a segment that receives no pages.
+	 */
+	if (!state->file)
+	{
+		create_segment(state);
+
+		/* Write zeroes to the previously skipped prefix. */
+		if (state->pageno > 0)
+		{
+			char		zerobuf[BLCKSZ] = {0};
+
+			for (uint64 i = 0; i < state->pageno; i++)
+			{
+				if (fwrite(zerobuf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
+					pg_fatal("could not write file \"%s\": %m", state->fn);
+			}
+		}
+	}
+
+	/* create_segment() exits on failure, so the file is open at this point. */
+	if (fwrite(buf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+
+	state->pageno++;
+
+	/*
+	 * Did we reach the maximum page number?  Then close segment file and
+	 * create a new one on the next iteration.
+	 */
+	if (state->pageno >= SLRU_PAGES_PER_SEGMENT)
+	{
+		state->segno++;
+		state->pageno = 0;
+		close_segment(state);
+	}
+}
+
+/*
+ * Convert pg_multixact/offsets segments and return oldest multi offset.
+ *
+ * Every multixact from the old cluster's oldest multi up to (but excluding)
+ * its next multi gets its 32-bit offset rewritten as a 64-bit offset in the
+ * new cluster, rebased so that the oldest offset becomes 1.  The return value
+ * is the old offset of the oldest multi, or 0 if no multi was converted.
+ */
+uint64
+convert_multixact_offsets(void)
+{
+	/* See multixact.c */
+#define MULTIXACT_OFFSETS_PER_PAGE_OLD	(BLCKSZ / sizeof(uint32))
+#define MULTIXACT_OFFSETS_PER_PAGE		(BLCKSZ / sizeof(MultiXactOffset))
+
+	SlruSegState oldseg = {0},
+				newseg = {0};
+	uint32		oldbuf[MULTIXACT_OFFSETS_PER_PAGE_OLD] = {0};
+	MultiXactOffset newbuf[MULTIXACT_OFFSETS_PER_PAGE] = {0};
+
+	/*
+	 * It is much easier to deal with multi wraparound in 64-bit format.  Thus
+	 * we use 64 bits for multi-transactions, although they remain 32 bits.
+	 */
+	uint64		oldest_multi = old_cluster.controldata.chkpnt_oldstMulti,
+				next_multi = old_cluster.controldata.chkpnt_nxtmulti,
+				multi,
+				old_entry,
+				new_entry;
+	bool		found = false;
+	uint64		oldest_offset = 0;
+
+	prep_status("Converting pg_multixact/offsets to 64-bit");
+
+	oldseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE_OLD;
+	oldseg.segno = oldseg.pageno / SLRU_PAGES_PER_SEGMENT;
+	oldseg.pageno %= SLRU_PAGES_PER_SEGMENT;
+	oldseg.dir = psprintf("%s/pg_multixact/offsets", old_cluster.pgdata);
+	oldseg.long_segment_names = false;	/* old format XXXX */
+
+	newseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE;
+	newseg.segno = newseg.pageno / SLRU_PAGES_PER_SEGMENT;
+	newseg.pageno %= SLRU_PAGES_PER_SEGMENT;
+	newseg.dir = psprintf("%s/pg_multixact/offsets", new_cluster.pgdata);
+	newseg.long_segment_names = true;
+
+	old_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE_OLD;
+	new_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE;
+
+	if (next_multi < oldest_multi)
+		next_multi += (uint64) 1 << 32; /* wraparound */
+
+	for (multi = oldest_multi; multi < next_multi; old_entry = 0)
+	{
+		int			oldlen;
+		bool		empty = false;
+		int64		read_segno;
+		uint64		read_pageno;
+
+		/*
+		 * Handle possible segment wraparound.  Multixact IDs stay 32 bits
+		 * wide, so the new-format segment number wraps as well.
+		 *
+		 * NOTE(review): create_segment() truncates, so a run that wraps back
+		 * into a segment it already wrote would lose pages - confirm that
+		 * such a nearly full multixact range cannot occur.
+		 */
+		if (oldseg.segno > MaxMultiXactId /
+			MULTIXACT_OFFSETS_PER_PAGE_OLD /
+			SLRU_PAGES_PER_SEGMENT)
+			oldseg.segno = 0;
+		if (newseg.segno > MaxMultiXactId /
+			MULTIXACT_OFFSETS_PER_PAGE /
+			SLRU_PAGES_PER_SEGMENT)
+			newseg.segno = 0;
+
+		/*
+		 * Remember which page we are about to read: the read advances the
+		 * iterator and may close the segment, leaving oldseg.fn NULL.
+		 */
+		read_segno = oldseg.segno;
+		read_pageno = oldseg.pageno;
+
+		/* Read old offset segment. */
+		oldlen = read_old_segment_page(&oldseg, oldbuf, &empty);
+		if (oldlen <= 0 || empty)
+			pg_fatal("cannot read page %llu of segment %04X in directory \"%s\"",
+					 (unsigned long long) read_pageno,
+					 (unsigned int) read_segno, oldseg.dir);
+
+		/* Fill possible gap. */
+		if (oldlen < BLCKSZ)
+			memset((char *) oldbuf + oldlen, 0, BLCKSZ - oldlen);
+
+		/* Save oldest multi offset */
+		if (!found)
+		{
+			oldest_offset = oldbuf[old_entry];
+			found = true;
+		}
+
+		/* ... skip wrapped-around invalid multi */
+		if (multi == (uint64) 1 << 32)
+		{
+			Assert(oldseg.segno == 0);
+			Assert(oldseg.pageno == 1);
+			Assert(old_entry == 0);
+			Assert(new_entry == 0);
+
+			multi += FirstMultiXactId;
+			old_entry = FirstMultiXactId;
+
+			/* Keep the new page aligned: the skipped slots stay zero. */
+			while (new_entry < FirstMultiXactId)
+				newbuf[new_entry++] = 0;
+		}
+
+		/* Copy entries to the new page. */
+		for (; multi < next_multi && old_entry < MULTIXACT_OFFSETS_PER_PAGE_OLD;
+			 multi++, old_entry++)
+		{
+			MultiXactOffset offset = oldbuf[old_entry];
+
+			/* Handle possible offset wraparound. */
+			if (offset < oldest_offset)
+				offset += ((uint64) 1 << 32) - 1;
+
+			/* Subtract oldest_offset, so new offsets will start from 1. */
+			newbuf[new_entry++] = offset - oldest_offset + 1;
+			if (new_entry >= MULTIXACT_OFFSETS_PER_PAGE)
+			{
+				/* Write a new page. */
+				write_new_segment_page(&newseg, newbuf);
+				new_entry = 0;
+			}
+		}
+	}
+
+	/* Write the last incomplete page. */
+	if (new_entry > 0 || oldest_multi == next_multi)
+	{
+		memset(&newbuf[new_entry], 0,
+			   sizeof(newbuf[0]) * (MULTIXACT_OFFSETS_PER_PAGE - new_entry));
+		write_new_segment_page(&newseg, newbuf);
+	}
+
+	/* Release resources. */
+	close_segment(&oldseg);
+	close_segment(&newseg);
+
+	pfree(oldseg.dir);
+	pfree(newseg.dir);
+
+	check_ok();
+
+	return oldest_offset;
+}
-- 
2.45.2