From 42607e0e6827b0146e423a74e3c115adaf5bf63b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 2 Jul 2019 10:17:29 -0700 Subject: [PATCH 2/2] Fix pathological page split issue. --- src/backend/access/nbtree/nbtsplitloc.c | 109 ++++++++++++++++++------ 1 file changed, 81 insertions(+), 28 deletions(-) diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index f43ec6774b..16a3c79721 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -74,7 +74,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult); static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, - bool *newitemonleft); + bool *newitemonleft, FindSplitStrat strategy); static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy); static void _bt_interval_edges(FindSplitData *state, @@ -361,10 +361,10 @@ _bt_findsplitloc(Relation rel, * fillfactormult, in order to deal with pages with a large number of * duplicates gracefully. * - * Pass low and high splits for the entire page (including even newitem). - * These are used when the initial split interval encloses split points - * that are full of duplicates, and we need to consider if it's even - * possible to avoid appending a heap TID. + * Pass low and high splits for the entire page (or a version of the page + * that includes newitem). These are used when the initial split interval + * encloses split points that are full of duplicates, and we need to + * consider if it's even possible to avoid appending a heap TID. 
*/ perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy); @@ -379,14 +379,18 @@ _bt_findsplitloc(Relation rel, /* * Many duplicates strategy is used when a heap TID would otherwise be * appended, but the page isn't completely full of logical duplicates. + * This increases locality of access for index scans. * * The split interval is widened to include all legal candidate split - * points. There may be a few as two distinct values in the whole-page - * split interval. Many duplicates strategy has no hard requirements for - * space utilization, though it still keeps the use of space balanced as a - * non-binding secondary goal (perfect penalty is set so that the - * first/lowest delta split points that avoids appending a heap TID is - * used). + * points on the page. There might be as few as two distinct values (or + * two distinct sets of values) in the whole-page split interval, though + * it's also possible that most of the values on the page are unique. The + * final split point will either be to the immediate left or to the + * immediate right of the group of duplicate tuples that surround the + * first/delta-optimal split point (perfect penalty was set so that the + * lowest delta split point that avoids appending a heap TID will be + * chosen). Maximizing the number of attributes that can be truncated + * away is only a goal for the default strategy. * * Single value strategy is used when it is impossible to avoid appending * a heap TID. It arranges to leave the left page very full. 
This @@ -399,6 +403,9 @@ _bt_findsplitloc(Relation rel, else if (strategy == SPLIT_MANY_DUPLICATES) { Assert(state.is_leaf); + /* Shouldn't try to truncate away extra user attributes */ + Assert(perfectpenalty == + IndexRelationGetNumberOfKeyAttributes(state.rel)); /* No need to resort splits -- no change in fillfactormult/deltas */ state.interval = state.nsplits; } @@ -419,7 +426,8 @@ _bt_findsplitloc(Relation rel, * the entry that has the lowest penalty, and is therefore expected to * maximize fan-out. Sets *newitemonleft for us. */ - foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft); + foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, + strategy); pfree(state.splits); return foundfirstright; @@ -753,11 +761,13 @@ _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) * page, plus a boolean indicating if new item is on left of split point. */ static OffsetNumber -_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft) +_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, + bool *newitemonleft, FindSplitStrat strategy) { int bestpenalty, lowsplit; int highsplit = Min(state->interval, state->nsplits); + SplitPoint *final; bestpenalty = INT_MAX; lowsplit = 0; @@ -781,8 +791,52 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft) } } - *newitemonleft = state->splits[lowsplit].newitemonleft; - return state->splits[lowsplit].firstoldonright; + final = &state->splits[lowsplit]; + + /* + * If "many duplicates" strategy ends up splitting far from the center of + * the page then the page as a whole will typically have few distinct + * values. There may occasionally be quite a few distinct tuples on + * either or both sides of the central group of duplicates, but that won't + * matter provided that space is consistently made available for future + * insertions of non-duplicates. 
Space will almost certainly be available + * for future insertions of tuples of the same duplicate value, but tuples + * with unique values that have not been inserted yet but are covered by + * the same range in the key space can be tricky. + * + * Random or monotonically increasing unique tuple insertions should not + * lead to repeated unbalanced page splits. If they're on the left hand + * side of the large group of duplicates then the large group of + * duplicates will be isolated, since the new high key for the page will + * have the same duplicate value (with truncated/-inf heap TID value). If + * random/increasing tuples appear on the right hand side of the dups then + * the new high key will be a low value among the unique values, which + * will also have acceptable space utilization. + * + * A clear problem case is monotonically _decreasing_ insertions where + * there happens to be a large group of duplicates to the left. There is + * a danger that the "many duplicates" strategy will choose a split point + * that derives the new high key from the lowest current unique value, + * which is still higher than all future unique key insertions. The same + * page will be split again and again, resulting in a succession of right + * halves that each contain very few tuples. + * + * Avoid problems with monotonically decreasing insertions by reverting to + * a delta-optimal split when the "many duplicates" split point is just to + * the left of new item's position. This suggests that values at and just + * after the would-be firstright tuple are not one large grouping. 
+ */ + if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && + !final->newitemonleft && + final->firstoldonright >= state->newitemoff && + final->firstoldonright < state->newitemoff + 10) + { + /* Do a 50:50 page split instead */ + final = &state->splits[0]; + } + + *newitemonleft = final->newitemonleft; + return final->firstoldonright; } /* @@ -859,17 +913,16 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, *strategy = SPLIT_MANY_DUPLICATES; /* - * Caller should choose the lowest delta split that avoids appending a - * heap TID. Maximizing the number of attributes that can be - * truncated away (returning perfectpenalty when it happens to be less - * than the number of key attributes in index) can result in continual - * unbalanced page splits. + * Many duplicates strategy should split at either side of the group of + * duplicates that surround the delta-optimal split point. Return + * indnkeyatts rather than the true perfect penalty to make that + * happen. (If perfectpenalty was returned here then low cardinality + * composite indexes could have continual unbalanced splits.) * - * Just avoiding appending a heap TID can still make splits very - * unbalanced, but this is self-limiting. When final split has a very - * high delta, one side of the split will likely consist of a single - * value. If that page is split once again, then that split will - * likely use the single value strategy. + * Note that caller won't go through with a many duplicates split in + * rare cases where it looks like there are ever-decreasing insertions + * to the immediate right of the split point. This must happen just + * before a final decision is made, within _bt_bestsplitloc(). */ return indnkeyatts; } @@ -879,9 +932,9 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * TIDs; otherwise, original default strategy split should proceed to * avoid pathological performance. 
Use page high key to infer if this is * the rightmost page among pages that store the same duplicate value. - * This should not prevent insertions of heap TIDs that are slightly out - * of order from using single value strategy, since that's expected with - * concurrent inserters of the same duplicate value. + * This won't prevent insertions of heap TIDs that are slightly out of + * order from using single value strategy, which is expected when there + * are concurrent inserters of the same duplicate value. */ else if (state->is_rightmost) *strategy = SPLIT_SINGLE_VALUE; -- 2.17.1