diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index f51c313..a8978dc 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -474,7 +474,7 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); - pageopaque->hasho_prevblkno = InvalidBlockNumber; + pageopaque->hasho_prevblkno = metap->hashm_maxbucket; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; @@ -892,7 +892,7 @@ _hash_splitbucket(Relation rel, * vacuum will clear page_has_garbage flag after deleting such tuples. */ oopaque->hasho_flag |= LH_BUCKET_PAGE_HAS_GARBAGE | LH_BUCKET_OLD_PAGE_SPLIT; - + oopaque->hasho_prevblkno = maxbucket; npage = BufferGetPage(nbuf); /* @@ -900,7 +900,7 @@ _hash_splitbucket(Relation rel, * split is in progress. */ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); - nopaque->hasho_prevblkno = InvalidBlockNumber; + nopaque->hasho_prevblkno = maxbucket; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_NEW_PAGE_SPLIT; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index e3a99cf..6e3fd13 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -112,13 +112,20 @@ _hash_readprev(Relation rel, * comments in _hash_readnext to know the reason of retaining pin. */ if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE) + { _hash_chgbufaccess(rel, *bufp, HASH_READ, HASH_NOLOCK); + + /* A primary bucket page has no previous page: its hasho_prevblkno does not hold a block number (a maxbucket value is stored there instead). 
*/ + *bufp = InvalidBuffer; + return; + } else _hash_relbuf(rel, *bufp); *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); + if (BlockNumberIsValid(blkno)) { *bufp = _hash_getbuf(rel, blkno, HASH_READ, @@ -153,10 +160,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) uint32 hashkey; Bucket bucket; BlockNumber blkno; - BlockNumber oldblkno = InvalidBuffer; - bool retry = false; Buffer buf; - Buffer metabuf; + Buffer metabuf = InvalidBuffer; Page page; HashPageOpaque opaque; HashMetaPage metap; @@ -214,101 +219,86 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) so->hashso_sk_hash = hashkey; - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - page = BufferGetPage(metabuf); - metap = HashPageGetMeta(page); - - /* - * Conditionally get the lock on primary bucket page for search while - * holding lock on meta page. If we have to wait, then release the meta - * page lock and retry it in a hard way. 
- */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); - - blkno = BUCKET_TO_BLKNO(metap, bucket); - - /* Fetch the primary bucket page for the bucket */ - buf = ReadBuffer(rel, blkno); - if (!ConditionalLockBufferShared(buf)) + if (rel->rd_amcache != NULL) { - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); - LockBuffer(buf, HASH_READ); - _hash_checkpage(rel, buf, LH_BUCKET_PAGE); - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); - oldblkno = blkno; - retry = true; + metap = (HashMetaPage)rel->rd_amcache; } else { - _hash_checkpage(rel, buf, LH_BUCKET_PAGE); + /* Read the metapage */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + /* Cache the metapage data for next time */ + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(HashMetaPageData)); + memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData)); + metap = (HashMetaPage)rel->rd_amcache; + + /* Release metapage lock, but keep pin. */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); } - if (retry) + /* + * Loop until we get a lock on the correct target bucket. + */ + for (;;) { /* - * Loop until we get a lock on the correct target bucket. We get the - * lock on primary bucket page and retain the pin on it during read - * operation to prevent the concurrent splits. Retaining pin on a - * primary bucket page ensures that split can't happen as it needs to - * acquire the cleanup lock on primary bucket page. Acquiring lock on - * primary bucket and rechecking if it is a target bucket is mandatory - * as otherwise a concurrent split followed by vacuum could remove - * tuples from the selected bucket which otherwise would have been - * visible. + * Compute the target bucket number, and convert to block number. */ - for (;;) - { - /* - * Compute the target bucket number, and convert to block number. 
- */ - bucket = _hash_hashkey2bucket(hashkey, - metap->hashm_maxbucket, - metap->hashm_highmask, - metap->hashm_lowmask); + bucket = _hash_hashkey2bucket(hashkey, + metap->hashm_maxbucket, + metap->hashm_highmask, + metap->hashm_lowmask); - blkno = BUCKET_TO_BLKNO(metap, bucket); + blkno = BUCKET_TO_BLKNO(metap, bucket); + + /* Fetch the primary bucket page for the bucket */ + buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); - /* Release metapage lock, but keep pin. */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + /* Check whether this bucket was split after we cached the metapage. + * To do this we check whether the maxbucket number stored in the + * bucket page, which is set to the then-current maxbucket number when + * the bucket is split, is less than or equal to the cached maxbucket. + * In case of upgrade, hasho_prevblkno of an old bucket page will be set + * to InvalidBlockNumber. As of now the maximum value that + * hashm_maxbucket can take is 1 less than InvalidBlockNumber + * (see _hash_expandtable), so an explicit check for InvalidBlockNumber + * in hasho_prevblkno cannot be confused with a stored maxbucket; such a + * page is treated as not split after the metapage was cached. + */ + if (opaque->hasho_prevblkno == InvalidBlockNumber || + opaque->hasho_prevblkno <= metap->hashm_maxbucket) + { + /* OK, we now have the right bucket; proceed to search in it. */ + break; + } - /* - * If the previous iteration of this loop locked what is still the - * correct target bucket, we are done. Otherwise, drop any old - * lock and lock what now appears to be the correct bucket. - */ - if (oldblkno == blkno) - break; - _hash_relbuf(rel, buf); + _hash_relbuf(rel, buf); - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); + /* The cached metapage is stale; refresh it and retry. NOTE(review): metabuf may already be pinned from an earlier read and only one _hash_dropbuf follows the loop -- confirm no pin is leaked here. 
*/ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData)); + metap = (HashMetaPage)rel->rd_amcache; - /* - * Reacquire metapage lock and check that no bucket split has - * taken place while we were awaiting the bucket lock. - */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ); - oldblkno = blkno; - } + /* Release Meta page buffer lock, but keep pin. */ + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); } - /* done with the metapage */ - _hash_dropbuf(rel, metabuf); + /* Done with the metapage */ + if (!BufferIsInvalid(metabuf)) + _hash_dropbuf(rel, metabuf); /* Update scan opaque state to show we have lock on the bucket */ so->hashso_bucket = bucket; so->hashso_bucket_valid = true; - - - page = BufferGetPage(buf); - opaque = (HashPageOpaque) PageGetSpecialPointer(page); - Assert(opaque->hasho_bucket == bucket); - so->hashso_bucket_buf = buf; /*