diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index a848a7e..31027da 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3026,42 +3026,62 @@ include_dir 'conf.d' There will be one or more active synchronous standbys; transactions waiting for commit will be allowed to proceed after these standby servers confirm receipt of their data. - The synchronous standbys will be those whose names appear - earlier in this list, and - that are both currently connected and streaming data in real-time - (as shown by a state of streaming in the - - pg_stat_replication view). - Other standby servers appearing later in this list represent potential - synchronous standbys. If any of the current synchronous - standbys disconnects for whatever reason, - it will be replaced immediately with the next-highest-priority standby. - Specifying more than one standby name can allow very high availability. This parameter specifies a list of standby servers using either of the following syntaxes: -num_sync ( standby_name [, ...] ) +[ANY] num_sync ( standby_name [, ...] ) +FIRST num_sync ( standby_name [, ...] ) standby_name [, ...] where num_sync is the number of synchronous standbys that transactions need to wait for replies from, and standby_name - is the name of a standby server. For example, a setting of - 3 (s1, s2, s3, s4) makes transaction commits wait - until their WAL records are received by three higher-priority standbys - chosen from standby servers s1, s2, - s3 and s4. + is the name of a standby server. + FIRST and ANY specify the method + by which the master server controls the standby servers. + + + FIRST means to control the standby servers with + different priorities. The synchronous standbys will be those + whose names appear earlier in this list, and that are both + currently connected and streaming data in real-time (as shown + by a state of streaming in the + + pg_stat_replication view). 
Other standby + servers appearing later in this list represent potential + synchronous standbys. If any of the current synchronous + standbys disconnects for whatever reason, it will be replaced + immediately with the next-highest-priority standby. + For example, a setting of FIRST 3 (s1, s2, s3, s4) + makes transaction commits wait until their WAL records are received + by three higher-priority standbys chosen from standby servers + s1, s2, s3 and s4. + + + ANY means to control all of the standby servers with + the same priority. The master server will wait for receipt from + at least num_sync + standbys, which is known as quorum commit in the literature. All of + the listed standbys are considered as candidates for quorum commit. + For example, a setting of ANY 3 (s1, s2, s3, s4) makes + transaction commits wait until receiving receipts from at least + any three standbys of the four listed servers s1, + s2, s3 and s4. + + + FIRST and ANY are case-insensitive words, + and a standby name that matches either of these words must be double-quoted. - The second syntax was used before PostgreSQL + The third syntax was used before PostgreSQL version 9.6 and is still supported. It's the same as the first syntax - with num_sync equal to 1. - For example, 1 (s1, s2) and - s1, s2 have the same meaning: either s1 - or s2 is chosen as a synchronous standby. + with FIRST and num_sync + equal to 1. For example, FIRST 1 (s1, s2) and s1, s2 + have the same meaning: either s1 or s2 is + chosen as a synchronous standby. 
The name of a standby server for this purpose is the diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index 06f49db..bd9f427 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -1134,7 +1134,7 @@ primary_slot_name = 'node_a_slot' Synchronous replication supports one or more synchronous standby servers; - transactions will wait until all the standby servers which are considered + transactions will wait until the multiple standby servers which are considered as synchronous confirm receipt of their data. The number of synchronous standbys that transactions must wait for replies from is specified in synchronous_standby_names. This parameter also specifies @@ -1150,7 +1150,7 @@ primary_slot_name = 'node_a_slot' An example of synchronous_standby_names for multiple synchronous standbys is: -synchronous_standby_names = '2 (s1, s2, s3)' +synchronous_standby_names = 'First 2 (s1, s2, s3)' In this example, if four standby servers s1, s2, s3 and s4 are running, the two standbys @@ -1161,6 +1161,18 @@ synchronous_standby_names = '2 (s1, s2, s3)' s2 fails. s4 is an asynchronous standby since its name is not in the list. + + Another example of synchronous_standby_names for multiple + synchronous standbys is: + + synchronous_standby_names = 'Any 2 (s1, s2, s3)' + + In this example, if four standby servers s1, s2, + s3 and s4 are running, the three standbys s1, + s2 and s3 will be considered as synchronous standby + candidates. The master server will wait for at least 2 replies from them. + s4 is an asynchronous standby since its name is not in the list. 
+ diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 0776428..dd47839 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1220,7 +1220,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i sync_state text - Synchronous state of this standby server + Synchronous state of this standby server. quorum + when the standby is considered as a candidate for quorum commit. diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index c99717e..da8bcf0 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -26,7 +26,7 @@ repl_gram.o: repl_scanner.c # syncrep_scanner is complied as part of syncrep_gram syncrep_gram.o: syncrep_scanner.c -syncrep_scanner.c: FLEXFLAGS = -CF -p +syncrep_scanner.c: FLEXFLAGS = -CF -p -i syncrep_scanner.c: FLEX_NO_BACKUP=yes # repl_gram.c, repl_scanner.c, syncrep_gram.c and syncrep_scanner.c diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index b442d06..bc67fce 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -76,9 +76,9 @@ char *SyncRepStandbyNames; #define SyncStandbysDefined() \ (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0') -static bool announce_next_takeover = true; +SyncRepConfigData *SyncRepConfig = NULL; -static SyncRepConfigData *SyncRepConfig = NULL; +static bool announce_next_takeover = true; static int SyncRepWaitMode = SYNC_REP_NO_WAIT; static void SyncRepQueueInsert(int mode); @@ -89,7 +89,12 @@ static bool SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, XLogRecPtr *applyPtr, bool *am_sync); +static bool SyncRepGetNNewestSyncRecPtr(XLogRecPtr *writePtr, + XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, + int pos, bool *am_sync); static int SyncRepGetStandbyPriority(void); +static int cmp_lsn(const void *a, const void *b); #ifdef USE_ASSERT_CHECKING static bool 
SyncRepQueueIsOrderedByLSN(int mode); @@ -384,7 +389,7 @@ SyncRepReleaseWaiters(void) XLogRecPtr writePtr; XLogRecPtr flushPtr; XLogRecPtr applyPtr; - bool got_oldest; + bool got_recptr; bool am_sync; int numwrite = 0; int numflush = 0; @@ -411,11 +416,16 @@ SyncRepReleaseWaiters(void) LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); /* - * Check whether we are a sync standby or not, and calculate the oldest - * positions among all sync standbys. + * Check whether we are a sync standby or not, and calculate the synced + * positions among all sync standbys using method. */ - got_oldest = SyncRepGetOldestSyncRecPtr(&writePtr, &flushPtr, - &applyPtr, &am_sync); + if (SyncRepConfig->sync_method == SYNC_REP_PRIORITY) + got_recptr = SyncRepGetOldestSyncRecPtr(&writePtr, &flushPtr, + &applyPtr, &am_sync); + else /* SYNC_REP_QUORUM */ + got_recptr = SyncRepGetNNewestSyncRecPtr(&writePtr, &flushPtr, + &applyPtr, SyncRepConfig->num_sync, + &am_sync); /* * If we are managing a sync standby, though we weren't prior to this, @@ -433,7 +443,7 @@ SyncRepReleaseWaiters(void) * If the number of sync standbys is less than requested or we aren't * managing a sync standby then just leave. */ - if (!got_oldest || !am_sync) + if (!got_recptr || !am_sync) { LWLockRelease(SyncRepLock); announce_next_takeover = !am_sync; @@ -469,6 +479,88 @@ SyncRepReleaseWaiters(void) } /* + * Calculate the 'pos' newest Write, Flush and Apply positions among sync standbys. + * + * Return false if the number of sync standbys is less than + * synchronous_standby_names specifies. Otherwise return true and + * store the 'pos' newest positions into *writePtr, *flushPtr, *applyPtr. + * + * On return, *am_sync is set to true if this walsender is connecting to + * sync standby. Otherwise it's set to false. 
+ */ +static bool +SyncRepGetNNewestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, int pos, bool *am_sync) +{ + XLogRecPtr *write_array; + XLogRecPtr *flush_array; + XLogRecPtr *apply_array; + List *sync_standbys; + ListCell *cell; + int len; + int i = 0; + + *writePtr = InvalidXLogRecPtr; + *flushPtr = InvalidXLogRecPtr; + *applyPtr = InvalidXLogRecPtr; + *am_sync = false; + + /* Get standbys that are considered as synchronous at this moment */ + sync_standbys = SyncRepGetSyncStandbys(am_sync); + + /* + * Quick exit if we are not managing a sync standby or there are not + * enough synchronous standbys. + */ + if (!(*am_sync) || + SyncRepConfig == NULL || + list_length(sync_standbys) < SyncRepConfig->num_sync) + { + list_free(sync_standbys); + return false; + } + + len = list_length(sync_standbys); + write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len); + flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len); + apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * len); + + /* + * Scan through all sync standbys and collect the Write, Flush and + * Apply positions of each one, reading them under its spinlock. 
+ */ + foreach (cell, sync_standbys) + { + WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)]; + + SpinLockAcquire(&walsnd->mutex); + write_array[i] = walsnd->write; + flush_array[i] = walsnd->flush; + apply_array[i] = walsnd->apply; + SpinLockRelease(&walsnd->mutex); + + i++; + } + + /* Sort each array in descending order to get 'pos' newest element */ + qsort(write_array, len, sizeof(XLogRecPtr), cmp_lsn); + qsort(flush_array, len, sizeof(XLogRecPtr), cmp_lsn); + qsort(apply_array, len, sizeof(XLogRecPtr), cmp_lsn); + + /* Get 'pos' newest Write, Flush, Apply positions */ + *writePtr = write_array[pos - 1]; + *flushPtr = flush_array[pos - 1]; + *applyPtr = apply_array[pos - 1]; + + pfree(write_array); + pfree(flush_array); + pfree(apply_array); + list_free(sync_standbys); + + return true; +} + +/* * Calculate the oldest Write, Flush and Apply positions among sync standbys. * * Return false if the number of sync standbys is less than @@ -506,12 +598,12 @@ SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, } /* - * Scan through all sync standbys and calculate the oldest Write, Flush - * and Apply positions. + * Scan through all sync standbys and calculate the oldest + * Write, Flush and Apply positions. */ - foreach(cell, sync_standbys) + foreach (cell, sync_standbys) { - WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)]; + WalSnd *walsnd = &WalSndCtl->walsnds[lfirst_int(cell)]; XLogRecPtr write; XLogRecPtr flush; XLogRecPtr apply; @@ -535,17 +627,88 @@ SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, } /* - * Return the list of sync standbys, or NIL if no sync standby is connected. + * Return the list of sync standbys according to the synchronous method, + * or NIL if no sync standby is connected. The caller must hold SyncRepLock. * - * If there are multiple standbys with the same priority, - * the first one found is selected preferentially. - * The caller must hold SyncRepLock. 
+ * On return, *am_sync is set to true if this walsender is connecting to + * sync standby. Otherwise it's set to false. + */ +List * +SyncRepGetSyncStandbys(bool *am_sync) +{ + /* Set default result */ + if (am_sync != NULL) + *am_sync = false; + + /* Quick exit if sync replication is not requested */ + if (SyncRepConfig == NULL) + return NIL; + + if (SyncRepConfig->sync_method == SYNC_REP_PRIORITY) + return SyncRepGetSyncStandbysPriority(am_sync); + else /* SYNC_REP_QUORUM */ + return SyncRepGetSyncStandbysQuorum(am_sync); +} + +/* + * Return the list of sync standbys using quorum method, or + * NIL if no sync standby is connected. In quorum method, all standby + * priorities are same, that is 1. So this function returns the list of + * standbys except for the standbys which are not active, or connected + * as async. + * + * On return, *am_sync is set to true if this walsender is connecting to + * sync standby. Otherwise it's set to false. + */ +List * +SyncRepGetSyncStandbysQuorum(bool *am_sync) +{ + List *result = NIL; + int i; + + for (i = 0; i < max_wal_senders; i++) + { + volatile WalSnd *walsnd = &WalSndCtl->walsnds[i]; + + /* Must be active */ + if (walsnd->pid == 0) + continue; + + /* Must be streaming */ + if (walsnd->state != WALSNDSTATE_STREAMING) + continue; + + /* Must be synchronous */ + if (walsnd->sync_standby_priority == 0) + continue; + + /* Must have a valid flush position */ + if (XLogRecPtrIsInvalid(walsnd->flush)) + continue; + + /* + * Consider this standby as candidate of sync and append + * it to the result. + */ + result = lappend_int(result, i); + if (am_sync != NULL && walsnd == MyWalSnd) + *am_sync = true; + } + + return result; +} + +/* + * Return the list of sync standbys using priority method, or + * NIL if no sync standby is connected. In priority method, + * if there are multiple standbys with the same priority, + * the first one found is selected preferentially. 
* * On return, *am_sync is set to true if this walsender is connecting to * sync standby. Otherwise it's set to false. */ List * -SyncRepGetSyncStandbys(bool *am_sync) +SyncRepGetSyncStandbysPriority(bool *am_sync) { List *result = NIL; List *pending = NIL; @@ -558,14 +721,6 @@ SyncRepGetSyncStandbys(bool *am_sync) volatile WalSnd *walsnd; /* Use volatile pointer to prevent code * rearrangement */ - /* Set default result */ - if (am_sync != NULL) - *am_sync = false; - - /* Quick exit if sync replication is not requested */ - if (SyncRepConfig == NULL) - return NIL; - lowest_priority = SyncRepConfig->nmembers; next_highest_priority = lowest_priority + 1; @@ -747,6 +902,10 @@ SyncRepGetStandbyPriority(void) standby_name += strlen(standby_name) + 1; } + /* In quorum method, all sync standby priorities are always 1 */ + if (found && SyncRepConfig->sync_method == SYNC_REP_QUORUM) + priority = 1; + return (found ? priority : 0); } @@ -890,6 +1049,23 @@ SyncRepQueueIsOrderedByLSN(int mode) #endif /* + * Compare lsn in order to sort array in descending order. 
+ */ +static int +cmp_lsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 > lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +/* * =========================================================== * Synchronous Replication functions executed by any process * =========================================================== diff --git a/src/backend/replication/syncrep_gram.y b/src/backend/replication/syncrep_gram.y index 35c2776..e10be8b 100644 --- a/src/backend/replication/syncrep_gram.y +++ b/src/backend/replication/syncrep_gram.y @@ -21,7 +21,7 @@ SyncRepConfigData *syncrep_parse_result; char *syncrep_parse_error_msg; static SyncRepConfigData *create_syncrep_config(const char *num_sync, - List *members); + List *members, int sync_method); /* * Bison doesn't allocate anything that needs to live across parser calls, @@ -46,7 +46,7 @@ static SyncRepConfigData *create_syncrep_config(const char *num_sync, SyncRepConfigData *config; } -%token NAME NUM JUNK +%token NAME NUM JUNK ANY FIRST %type result standby_config %type standby_list @@ -60,8 +60,10 @@ result: ; standby_config: - standby_list { $$ = create_syncrep_config("1", $1); } - | NUM '(' standby_list ')' { $$ = create_syncrep_config($1, $3); } + standby_list { $$ = create_syncrep_config("1", $1, SYNC_REP_PRIORITY); } + | NUM '(' standby_list ')' { $$ = create_syncrep_config($1, $3, SYNC_REP_QUORUM); } + | ANY NUM '(' standby_list ')' { $$ = create_syncrep_config($2, $4, SYNC_REP_QUORUM); } + | FIRST NUM '(' standby_list ')' { $$ = create_syncrep_config($2, $4, SYNC_REP_PRIORITY); } ; standby_list: @@ -77,7 +79,7 @@ standby_name: static SyncRepConfigData * -create_syncrep_config(const char *num_sync, List *members) +create_syncrep_config(const char *num_sync, List *members, int sync_method) { SyncRepConfigData *config; int size; @@ -98,6 +100,7 @@ create_syncrep_config(const char *num_sync, List 
*members) config->config_size = size; config->num_sync = atoi(num_sync); + config->sync_method = sync_method; config->nmembers = list_length(members); ptr = config->member_names; foreach(lc, members) diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index d20662e..403fd7d 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -54,6 +54,8 @@ digit [0-9] ident_start [A-Za-z\200-\377_] ident_cont [A-Za-z\200-\377_0-9\$] identifier {ident_start}{ident_cont}* +any_ident any +first_ident first dquote \" xdstart {dquote} @@ -64,6 +66,14 @@ xdinside [^"]+ %% {space}+ { /* ignore */ } +{any_ident} { + yylval.str = pstrdup(yytext); + return ANY; + } +{first_ident} { + yylval.str = pstrdup(yytext); + return FIRST; + } {xdstart} { initStringInfo(&xdbuf); BEGIN(xd); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index c7743da..00467a4 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2862,7 +2862,8 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) if (priority == 0) values[7] = CStringGetTextDatum("async"); else if (list_member_int(sync_standbys, i)) - values[7] = CStringGetTextDatum("sync"); + values[7] = SyncRepConfig->sync_method == SYNC_REP_PRIORITY ? + CStringGetTextDatum("sync") : CStringGetTextDatum("quorum"); else values[7] = CStringGetTextDatum("potential"); } diff --git a/src/include/replication/syncrep.h b/src/include/replication/syncrep.h index e4e0e27..1b675ee 100644 --- a/src/include/replication/syncrep.h +++ b/src/include/replication/syncrep.h @@ -32,6 +32,10 @@ #define SYNC_REP_WAITING 1 #define SYNC_REP_WAIT_COMPLETE 2 +/* sync_method of SyncRepConfigData */ +#define SYNC_REP_PRIORITY 0 +#define SYNC_REP_QUORUM 1 + /* * Struct for the configuration of synchronous replication. 
* @@ -45,10 +49,13 @@ typedef struct SyncRepConfigData int num_sync; /* number of sync standbys that we need to * wait for */ int nmembers; /* number of members in the following list */ + int sync_method; /* synchronization method */ /* member_names contains nmembers consecutive nul-terminated C strings */ char member_names[FLEXIBLE_ARRAY_MEMBER]; } SyncRepConfigData; +extern SyncRepConfigData *SyncRepConfig; + /* communication variables for parsing synchronous_standby_names GUC */ extern SyncRepConfigData *syncrep_parse_result; extern char *syncrep_parse_error_msg; @@ -68,6 +75,8 @@ extern void SyncRepReleaseWaiters(void); /* called by wal sender and user backend */ extern List *SyncRepGetSyncStandbys(bool *am_sync); +extern List *SyncRepGetSyncStandbysPriority(bool *am_sync); +extern List *SyncRepGetSyncStandbysQuorum(bool *am_sync); /* called by checkpointer */ extern void SyncRepUpdateSyncStandbysDefined(void); diff --git a/src/test/recovery/t/007_sync_rep.pl b/src/test/recovery/t/007_sync_rep.pl index 0c87226..63cd88c 100644 --- a/src/test/recovery/t/007_sync_rep.pl +++ b/src/test/recovery/t/007_sync_rep.pl @@ -3,7 +3,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 8; +use Test::More tests => 10; # Query checking sync_priority and sync_state of each standby my $check_sql = @@ -107,7 +107,7 @@ test_sync_state( $node_master, qq(standby2|2|sync standby3|3|sync), '2 synchronous standbys', - '2(standby1,standby2,standby3)'); + 'First 2(standby1,standby2,standby3)'); # Start standby1 $node_standby_1->start; @@ -138,7 +138,7 @@ standby2|4|sync standby3|3|sync standby4|1|sync), 'num_sync exceeds the num of potential sync standbys', - '6(standby4,standby0,standby3,standby2)'); + 'First 6(standby4,standby0,standby3,standby2)'); # The setting that * comes before another standby name is acceptable # but does not make sense in most cases. 
Check that sync_state is @@ -150,7 +150,7 @@ standby2|2|sync standby3|2|potential standby4|2|potential), 'asterisk comes before another standby name', - '2(standby1,*,standby2)'); + 'First 2(standby1,*,standby2)'); # Check that the setting of '2(*)' chooses standby2 and standby3 that are stored # earlier in WalSnd array as sync standbys. @@ -160,7 +160,7 @@ standby2|1|sync standby3|1|sync standby4|1|potential), 'multiple standbys having the same priority are chosen as sync', - '2(*)'); + 'First 2(*)'); # Stop Standby3 which is considered in 'sync' state. $node_standby_3->stop; @@ -172,3 +172,25 @@ test_sync_state( standby2|1|sync standby4|1|potential), 'potential standby found earlier in array is promoted to sync'); + +# Check that the standbys listed as voters all have the +# same priority when synchronous_standby_names uses quorum method. +test_sync_state( +$node_master, qq(standby1|1|quorum +standby2|1|quorum +standby4|0|async), +'2 quorum and 1 async', +'Any 2(standby1, standby2)'); + +# Start Standby3 which will be considered in 'quorum' state. +$node_standby_3->start; + +# Check that the setting of 'Any 2(*)' chooses all standbys as +# voters. +test_sync_state( +$node_master, qq(standby1|1|quorum +standby2|1|quorum +standby3|1|quorum +standby4|1|quorum), +'all standbys are considered as condidates for quorum commit', +'Any 2(*)');