diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 6292e6c..be8d93c 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -528,8 +528,12 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
 	 * primary bucket. We don't need to aqcuire buffer lock to fix the
 	 * primary bucket or if the previous bucket is same as write bucket, as we
 	 * already have lock on those buckets.
+	 * If the page is a primary bucket page, hasho_prevblkno instead holds the
+	 * hashm_maxbucket value from when the bucket was created or last split,
+	 * so we explicitly check for LH_BUCKET_PAGE.
 	 */
-	if (BlockNumberIsValid(prevblkno))
+	if (BlockNumberIsValid(prevblkno) &&
+		!(ovflopaque->hasho_flag & LH_BUCKET_PAGE))
 	{
 		if (prevblkno == bucketblkno)
 			prevbuf = bucketbuf;
@@ -602,7 +606,8 @@ _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
 	_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
 	MarkBufferDirty(ovflbuf);
 
-	if (BufferIsValid(prevbuf))
+	if (BufferIsValid(prevbuf) &&
+		!(ovflopaque->hasho_flag & LH_BUCKET_PAGE))
 	{
 		Page		prevpage = BufferGetPage(prevbuf);
 		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 8e00d34..4b5c27d 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -440,6 +440,7 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
 	for (i = 0; i < num_buckets; i++)
 	{
 		BlockNumber blkno;
+		HashPageOpaque pageopaque;
 
 		/* Allow interrupts, in case N is huge */
 		CHECK_FOR_INTERRUPTS();
@@ -447,6 +448,9 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
 		blkno = BUCKET_TO_BLKNO(metap, i);
 		buf = _hash_getnewbuf(rel, blkno, forkNum);
 		_hash_initbuf(buf, i, LH_BUCKET_PAGE, false);
+		pageopaque =
+			(HashPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));
+		pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
 		MarkBufferDirty(buf);
 
 		log_newpage(&rel->rd_node,
@@ -881,6 +885,8 @@ restart_expand:
 	 * Okay to proceed with split. Update the metapage bucket mapping info.
 	 */
 	metap->hashm_maxbucket = new_bucket;
+	nopaque->hasho_prevblkno = metap->hashm_maxbucket;
+	oopaque->hasho_prevblkno = metap->hashm_maxbucket;
 
 	if (new_bucket > metap->hashm_highmask)
 	{
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 0df64a8..41a3cf0 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -112,7 +112,13 @@ _hash_readprev(Relation rel,
 	 * comments in _hash_readnext to know the reason of retaining pin.
 	 */
 	if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE)
+	{
 		_hash_chgbufaccess(rel, *bufp, HASH_READ, HASH_NOLOCK);
+
+		/* If it is a bucket page there will not be a prevblkno. */
+		*bufp = InvalidBuffer;
+		return;
+	}
 	else
 		_hash_relbuf(rel, *bufp);
 
@@ -153,10 +159,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	uint32		hashkey;
 	Bucket		bucket;
 	BlockNumber blkno;
-	BlockNumber oldblkno = InvalidBuffer;
-	bool		retry = false;
 	Buffer		buf;
-	Buffer		metabuf;
+	Buffer		metabuf = InvalidBuffer;
 	Page		page;
 	HashPageOpaque opaque;
 	HashMetaPage metap;
@@ -214,96 +218,90 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
 	so->hashso_sk_hash = hashkey;
 
-	/* Read the metapage */
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	page = BufferGetPage(metabuf);
-	metap = HashPageGetMeta(page);
-
-	/*
-	 * Conditionally get the lock on primary bucket page for search while
-	 * holding lock on meta page. If we have to wait, then release the meta
-	 * page lock and retry it in a hard way.
-	 */
-	bucket = _hash_hashkey2bucket(hashkey,
-								  metap->hashm_maxbucket,
-								  metap->hashm_highmask,
-								  metap->hashm_lowmask);
-
-	blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-	/* Fetch the primary bucket page for the bucket */
-	buf = ReadBuffer(rel, blkno);
-	if (!ConditionalLockBufferShared(buf))
+	if (rel->rd_amcache != NULL)
 	{
-		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
-		LockBuffer(buf, HASH_READ);
-		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
-		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
-		oldblkno = blkno;
-		retry = true;
+		metap = (HashMetaPage) rel->rd_amcache;
 	}
 	else
 	{
-		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
+		/* Read the metapage */
+		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+		page = BufferGetPage(metabuf);
+		metap = HashPageGetMeta(page);
+
+		/* Cache the metapage data for next time */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(HashMetaPageData));
+		memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData));
+		metap = (HashMetaPage) rel->rd_amcache;
+
+		/* Release metapage lock, but keep pin. */
 		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 	}
 
-	if (retry)
+	/*
+	 * Loop until we get a lock on the correct target bucket.
+	 */
+	for (;;)
 	{
 		/*
-		 * Loop until we get a lock on the correct target bucket. We get the
-		 * lock on primary bucket page and retain the pin on it during read
-		 * operation to prevent the concurrent splits. Retaining pin on a
-		 * primary bucket page ensures that split can't happen as it needs to
-		 * acquire the cleanup lock on primary bucket page. Acquiring lock on
-		 * primary bucket and rechecking if it is a target bucket is mandatory
-		 * as otherwise a concurrent split followed by vacuum could remove
-		 * tuples from the selected bucket which otherwise would have been
-		 * visible.
+		 * Compute the target bucket number, and convert to block number.
 		 */
-		for (;;)
-		{
-			/*
-			 * Compute the target bucket number, and convert to block number.
-			 */
-			bucket = _hash_hashkey2bucket(hashkey,
-										  metap->hashm_maxbucket,
-										  metap->hashm_highmask,
-										  metap->hashm_lowmask);
-
-			blkno = BUCKET_TO_BLKNO(metap, bucket);
+		bucket = _hash_hashkey2bucket(hashkey,
+									  metap->hashm_maxbucket,
+									  metap->hashm_highmask,
+									  metap->hashm_lowmask);
+		blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+		/* Fetch the primary bucket page for the bucket */
+		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+		page = BufferGetPage(buf);
+		TestForOldSnapshot(scan->xs_snapshot, rel, page);
+		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+		Assert(opaque->hasho_bucket == bucket);
 
-			/* Release metapage lock, but keep pin. */
-			_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+		/*
+		 * Check whether this bucket has been split since the metapage was
+		 * cached. A primary bucket page stores in hasho_prevblkno the value
+		 * hashm_maxbucket had when the bucket was created or last split, so
+		 * the cached copy is current enough if that stored value is less than
+		 * or equal to the cached hashm_maxbucket. After an upgrade,
+		 * hasho_prevblkno of an old bucket page is InvalidBlockNumber; since
+		 * hashm_maxbucket is at most one less than InvalidBlockNumber (see
+		 * _hash_expandtable), that value is checked for explicitly and
+		 * treated as "not split since the metapage was cached".
+		 */
+		if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+			opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+		{
+			/* Ok now we have the right bucket proceed to search in it. */
+			break;
+		}
 
-			/*
-			 * If the previous iteration of this loop locked what is still the
-			 * correct target bucket, we are done. Otherwise, drop any old
-			 * lock and lock what now appears to be the correct bucket.
-			 */
-			if (oldblkno == blkno)
-				break;
-			_hash_relbuf(rel, buf);
+		_hash_relbuf(rel, buf);
 
-			/* Fetch the primary bucket page for the bucket */
-			buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+		/*
+		 * The cached metapage is stale, so re-read it. A pin on the metapage
+		 * may already be held from an earlier read; in that case only retake
+		 * the lock, because a single pin is dropped after the loop.
+		 */
+		if (BufferIsValid(metabuf))
+			_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
+		else
+			metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+								   LH_META_PAGE);
+		page = BufferGetPage(metabuf);
+		metap = HashPageGetMeta(page);
+		memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData));
+		metap = (HashMetaPage) rel->rd_amcache;
 
-			/*
-			 * Reacquire metapage lock and check that no bucket split has
-			 * taken place while we were awaiting the bucket lock.
-			 */
-			_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
-			oldblkno = blkno;
-		}
+		/* Release Meta page buffer lock, but keep pin. */
+		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 	}
 
-	/* done with the metapage */
-	_hash_dropbuf(rel, metabuf);
-
-	page = BufferGetPage(buf);
-	TestForOldSnapshot(scan->xs_snapshot, rel, page);
-	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-	Assert(opaque->hasho_bucket == bucket);
+	/* Done with the metapage */
+	if (BufferIsValid(metabuf))
+		_hash_dropbuf(rel, metabuf);
 
 	so->hashso_bucket_buf = buf;
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index c0434f5..8dd8130 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -60,7 +60,14 @@ typedef uint32 Bucket;
 
 typedef struct HashPageOpaqueData
 {
-	BlockNumber hasho_prevblkno;	/* previous ovfl (or bucket) blkno */
+	/*
+	 * hasho_prevblkno stores the previous ovfl (or bucket) blkno. As a
+	 * special case, on a primary bucket page hasho_prevblkno instead stores
+	 * the value hashm_maxbucket had when that bucket was created or last
+	 * split. This is used to verify whether a cached metapage can be used
+	 * or has to be reread.
+	 */
+	BlockNumber hasho_prevblkno;
 	BlockNumber hasho_nextblkno;	/* next ovfl blkno */
 	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
 	uint16		hasho_flag;		/* page type code, see above */