WIP patch for serializable transactions with predicate locking
I heard that others were considering work on predicate locks for
9.1. Since Dan Ports of MIT and I have been working on that for the
serializable implementation for the last few weeks, I felt it would
be good to post a WIP patch to avoid duplicate effort. This
implementation compiles without warnings, passes all regression
tests, and passes several hundred permutations of a dozen basic
tests which are intended to confirm correct predicate locking.
SIREAD locks are generally taken at the tuple level for the heap,
with granularity promotion to page and then relation level as needed
to prevent resource exhaustion. We've been using indexes to
implement predicate locking, and currently have page level locking
for btree indexes and only relation level locking otherwise, but Dan
is actively working on getting btree down to next-key locking, and
when I return from PGCon I will be working on the other index AMs.
We're currently using Markus Wanner's dtester for testing, but that
can be moved out of the patch if we don't want to have a dependency
on it.
-Kevin
Attachments:
serializable-1.patchapplication/octet-stream; name=serializable-1.patchDownload
*** a/GNUmakefile.in
--- b/GNUmakefile.in
***************
*** 75,81 **** distclean maintainer-clean:
check: all
! check installcheck installcheck-parallel:
$(MAKE) -C src/test $@
installcheck-world:
--- 75,81 ----
check: all
! check dcheck installcheck installcheck-parallel:
$(MAKE) -C src/test $@
installcheck-world:
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 57,62 ****
--- 57,63 ----
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/standby.h"
***************
*** 261,280 **** heapgetpage(HeapScanDesc scan, BlockNumber page)
{
if (ItemIdIsNormal(lpp))
{
bool valid;
if (all_visible)
valid = true;
else
{
- HeapTupleData loctup;
-
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
loctup.t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(loctup.t_self), page, lineoff);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
}
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
--- 262,283 ----
{
if (ItemIdIsNormal(lpp))
{
+ HeapTupleData loctup;
bool valid;
if (all_visible)
valid = true;
else
{
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
loctup.t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(loctup.t_self), page, lineoff);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
}
+
+ CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer);
+
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
***************
*** 468,479 **** heapgettup(HeapScanDesc scan,
--- 471,485 ----
snapshot,
scan->rs_cbuf);
+ CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf);
+
if (valid && key != NULL)
HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
nkeys, key, valid);
if (valid)
{
+ PredicateLockTuple(scan->rs_rd, tuple);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
return;
}
***************
*** 741,752 **** heapgettup_pagemode(HeapScanDesc scan,
--- 747,760 ----
nkeys, key, valid);
if (valid)
{
+ PredicateLockTuple(scan->rs_rd, tuple);
scan->rs_cindex = lineindex;
return;
}
}
else
{
+ PredicateLockTuple(scan->rs_rd, tuple);
scan->rs_cindex = lineindex;
return;
}
***************
*** 1460,1467 **** heap_fetch(Relation relation,
--- 1468,1478 ----
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ CheckForSerializableConflictOut(valid, relation, tuple, buffer);
+
if (valid)
{
+ PredicateLockTuple(relation, tuple);
/*
* All checks passed, so return the tuple as valid. Caller is now
* responsible for releasing the buffer.
***************
*** 1505,1517 **** heap_fetch(Relation relation,
* heap_fetch, we do not report any pgstats count; caller may do so if wanted.
*/
bool
! heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
! bool *all_dead)
{
Page dp = (Page) BufferGetPage(buffer);
TransactionId prev_xmax = InvalidTransactionId;
OffsetNumber offnum;
bool at_chain_start;
if (all_dead)
*all_dead = true;
--- 1516,1530 ----
* heap_fetch, we do not report any pgstats count; caller may do so if wanted.
*/
bool
! heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
! Snapshot snapshot, bool *all_dead)
{
Page dp = (Page) BufferGetPage(buffer);
TransactionId prev_xmax = InvalidTransactionId;
OffsetNumber offnum;
bool at_chain_start;
+ bool valid;
+ bool match_found;
if (all_dead)
*all_dead = true;
***************
*** 1521,1526 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
--- 1534,1540 ----
Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
offnum = ItemPointerGetOffsetNumber(tid);
at_chain_start = true;
+ match_found = false;
/* Scan through possible multiple members of HOT-chain */
for (;;)
***************
*** 1551,1556 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
--- 1565,1572 ----
heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
heapTuple.t_len = ItemIdGetLength(lp);
+ heapTuple.t_tableOid = relation->rd_id;
+ heapTuple.t_self = *tid;
/*
* Shouldn't see a HEAP_ONLY tuple at chain start.
***************
*** 1568,1579 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
break;
/* If it's visible per the snapshot, we must return it */
! if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
{
ItemPointerSetOffsetNumber(tid, offnum);
if (all_dead)
*all_dead = false;
! return true;
}
/*
--- 1584,1601 ----
break;
/* If it's visible per the snapshot, we must return it */
! valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer);
! CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer);
! if (valid)
{
ItemPointerSetOffsetNumber(tid, offnum);
+ PredicateLockTuple(relation, &heapTuple);
if (all_dead)
*all_dead = false;
! if (IsXactIsoLevelFullySerializable)
! match_found = true;
! else
! return true;
}
/*
***************
*** 1602,1608 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
break; /* end of chain */
}
! return false;
}
/*
--- 1624,1630 ----
break; /* end of chain */
}
! return match_found;
}
/*
***************
*** 1621,1627 **** heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
LockBuffer(buffer, BUFFER_LOCK_SHARE);
! result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
return result;
--- 1643,1649 ----
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
LockBuffer(buffer, BUFFER_LOCK_SHARE);
! result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
return result;
***************
*** 1728,1735 **** heap_get_latest_tid(Relation relation,
--- 1750,1760 ----
* result candidate.
*/
valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
+ CheckForSerializableConflictOut(valid, relation, &tp, buffer);
if (valid)
+ {
*tid = ctid;
+ }
/*
* If there's a valid t_ctid link, follow it, else we're done.
***************
*** 1892,1897 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid,
--- 1917,1929 ----
buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
InvalidBuffer, options, bistate);
+ /*
+ * We're about to do the actual insert -- check for conflict at the
+ * relation or buffer level first, to avoid possibly having to roll
+ * back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, NULL, buffer);
+
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
***************
*** 2192,2197 **** l1:
--- 2224,2235 ----
return result;
}
+ /*
+ * We're about to do the actual delete -- check for conflict first,
+ * to avoid possibly having to roll back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, &tp, buffer);
+
/* replace cid with a combo cid if necessary */
HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
***************
*** 2545,2550 **** l2:
--- 2583,2594 ----
return result;
}
+ /*
+ * We're about to do the actual update -- check for conflict first,
+ * to avoid possibly having to roll back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, &oldtup, buffer);
+
/* Fill in OID and transaction status data for newtup */
if (relation->rd_rel->relhasoids)
{
***************
*** 2690,2695 **** l2:
--- 2734,2749 ----
}
/*
+ * We're about to create the new tuple -- check for conflict first,
+ * to avoid possibly having to roll back work we've just done.
+ *
+ * NOTE: For a tuple insert, we only need to check for table locks, since
+ * predicate locking at the index level will cover ranges for anything
+ * except a table scan. Therefore, only provide the relation.
+ */
+ CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
+
+ /*
* At this point newbuf and buffer are both pinned and locked, and newbuf
* has enough space for the new tuple. If they are the same buffer, only
* one pin is held.
***************
*** 2829,2834 **** l2:
--- 2883,2894 ----
CacheInvalidateHeapTuple(relation, heaptup);
/*
+ * TODO SSI: In order to support SIREAD locks at tuple granularity, any
+ * existing SIREAD locks on the old tuple must be copied to
+ * also refer to the new tuple, somewhere around this point?
+ */
+
+ /*
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
*** a/src/backend/access/index/indexam.c
--- b/src/backend/access/index/indexam.c
***************
*** 64,72 ****
--- 64,74 ----
#include "access/relscan.h"
#include "access/transam.h"
+ #include "access/xact.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
***************
*** 192,197 **** index_insert(Relation indexRelation,
--- 194,204 ----
RELATION_CHECKS;
GET_REL_PROCEDURE(aminsert);
+ if (!(indexRelation->rd_am->ampredlocks))
+ CheckForSerializableConflictIn(indexRelation,
+ (HeapTuple) NULL,
+ InvalidBuffer);
+
/*
* have the am's insert proc do all the work.
*/
***************
*** 266,271 **** index_beginscan_internal(Relation indexRelation,
--- 273,281 ----
RELATION_CHECKS;
GET_REL_PROCEDURE(ambeginscan);
+ if (!(indexRelation->rd_am->ampredlocks))
+ PredicateLockRelation(indexRelation);
+
/*
* We hold a reference count to the relcache entry throughout the scan.
*/
***************
*** 515,520 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
--- 525,531 ----
{
ItemId lp;
ItemPointer ctid;
+ bool valid;
/* check for bogus TID */
if (offnum < FirstOffsetNumber ||
***************
*** 569,576 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
break;
/* If it's visible per the snapshot, we must return it */
! if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
! scan->xs_cbuf))
{
/*
* If the snapshot is MVCC, we know that it could accept at
--- 580,592 ----
break;
/* If it's visible per the snapshot, we must return it */
! valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
! scan->xs_cbuf);
!
! CheckForSerializableConflictOut(valid, scan->heapRelation,
! heapTuple, scan->xs_cbuf);
!
! if (valid)
{
/*
* If the snapshot is MVCC, we know that it could accept at
***************
*** 578,584 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
! if (IsMVCCSnapshot(scan->xs_snapshot))
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
--- 594,601 ----
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
! if (IsMVCCSnapshot(scan->xs_snapshot)
! && !IsXactIsoLevelFullySerializable)
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
***************
*** 594,599 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
--- 611,618 ----
pgstat_count_heap_fetch(scan->indexRelation);
+ PredicateLockTuple(scan->heapRelation, heapTuple);
+
return heapTuple;
}
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***************
*** 21,26 ****
--- 21,27 ----
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "utils/inval.h"
#include "utils/tqual.h"
***************
*** 175,180 **** top:
--- 176,189 ----
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
+ /*
+ * The only conflict predicate locking cares about for indexes is when
+ * an index tuple insert conflicts with an existing lock. Since the
+ * actual location of the insert is hard to predict because of the
+ * random search used to prevent O(N^2) performance when there are many
+ * duplicate entries, we can just use the "first valid" page.
+ */
+ CheckForSerializableConflictIn(rel, NULL, buf);
/* do the insertion */
_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
_bt_insertonpg(rel, buf, stack, itup, offset, false);
***************
*** 697,702 **** _bt_insertonpg(Relation rel,
--- 706,714 ----
/* split the buffer into left and right halves */
rbuf = _bt_split(rel, buf, firstright,
newitemoff, itemsz, itup, newitemonleft);
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buf),
+ BufferGetBlockNumber(rbuf));
/*----------
* By here,
*** a/src/backend/access/nbtree/nbtpage.c
--- b/src/backend/access/nbtree/nbtpage.c
***************
*** 1177,1182 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
--- 1177,1188 ----
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
/*
+ * Any insert which would have gone on the target block will now go to the
+ * right sibling block.
+ */
+ PredicateLockPageCombine(rel, target, rightsib);
+
+ /*
* Next find and write-lock the current parent of the target page. This is
* essentially the same as the corresponding step of splitting.
*/
*** a/src/backend/access/nbtree/nbtsearch.c
--- b/src/backend/access/nbtree/nbtsearch.c
***************
*** 21,26 ****
--- 21,27 ----
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+ #include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
***************
*** 63,69 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
--- 64,73 ----
/* If index is empty and access = BT_READ, no root page is created. */
if (!BufferIsValid(*bufP))
+ {
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
return (BTStack) NULL;
+ }
/* Loop iterates once per level descended in the tree */
for (;;)
***************
*** 88,94 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
--- 92,102 ----
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISLEAF(opaque))
+ {
+ if (access == BT_READ)
+ PredicateLockPage(rel, BufferGetBlockNumber(*bufP));
break;
+ }
/*
* Find the appropriate item on the internal page, and get the child
***************
*** 199,204 **** _bt_moveright(Relation rel,
--- 207,213 ----
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
+ PredicateLockPage(rel, BufferGetBlockNumber(buf));
return buf;
}
***************
*** 1142,1147 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir)
--- 1151,1157 ----
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
{
+ PredicateLockPage(rel, blkno);
/* see if there are any matches on this page */
/* note that this will clear moreRight if we can stop */
if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
***************
*** 1189,1194 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir)
--- 1199,1205 ----
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
{
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf));
/* see if there are any matches on this page */
/* note that this will clear moreLeft if we can stop */
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
***************
*** 1352,1357 **** _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
--- 1363,1369 ----
if (!BufferIsValid(buf))
{
/* empty index... */
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
return InvalidBuffer;
}
***************
*** 1431,1440 **** _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
--- 1443,1454 ----
if (!BufferIsValid(buf))
{
/* empty index... */
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
so->currPos.buf = InvalidBuffer;
return false;
}
+ PredicateLockPage(rel, BufferGetBlockNumber(buf));
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
*** a/src/backend/access/transam/xact.c
--- b/src/backend/access/transam/xact.c
***************
*** 39,44 ****
--- 39,45 ----
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
***************
*** 1749,1754 **** CommitTransaction(void)
--- 1750,1762 ----
AtEOXact_LargeObject(true);
/*
+ * Mark serializable transaction as complete for predicate locking
+ * purposes. This should be done as late as we can put it and still
+ * allow errors to be raised for failure patterns found at commit.
+ */
+ PreCommit_CheckForSerializationFailure();
+
+ /*
* Insert notifications sent by NOTIFY commands into the queue. This
* should be late in the pre-commit sequence to minimize time spent
* holding the notify-insertion lock.
*** a/src/backend/catalog/index.c
--- b/src/backend/catalog/index.c
***************
*** 2044,2050 **** IndexCheckExclusion(Relation heapRelation,
*
* After completing validate_index(), we wait until all transactions that
* were alive at the time of the reference snapshot are gone; this is
! * necessary to be sure there are none left with a serializable snapshot
* older than the reference (and hence possibly able to see tuples we did
* not index). Then we mark the index "indisvalid" and commit. Subsequent
* transactions will be able to use it for queries.
--- 2044,2050 ----
*
* After completing validate_index(), we wait until all transactions that
* were alive at the time of the reference snapshot are gone; this is
! * necessary to be sure there are none left with a transaction-based snapshot
* older than the reference (and hence possibly able to see tuples we did
* not index). Then we mark the index "indisvalid" and commit. Subsequent
* transactions will be able to use it for queries.
*** a/src/backend/commands/trigger.c
--- b/src/backend/commands/trigger.c
***************
*** 2360,2366 **** ltrmark:;
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 2360,2366 ----
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/execMain.c
--- b/src/backend/executor/execMain.c
***************
*** 1538,1544 **** EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 1538,1544 ----
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/nodeBitmapHeapscan.c
--- b/src/backend/executor/nodeBitmapHeapscan.c
***************
*** 42,47 ****
--- 42,48 ----
#include "executor/nodeBitmapHeapscan.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+ #include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
***************
*** 351,357 **** bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
ItemPointerData tid;
ItemPointerSet(&tid, page, offnum);
! if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL))
scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
}
}
--- 352,358 ----
ItemPointerData tid;
ItemPointerSet(&tid, page, offnum);
! if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, NULL))
scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
}
}
*** a/src/backend/executor/nodeIndexscan.c
--- b/src/backend/executor/nodeIndexscan.c
***************
*** 30,35 ****
--- 30,36 ----
#include "executor/execdebug.h"
#include "executor/nodeIndexscan.h"
#include "optimizer/clauses.h"
+ #include "storage/predicate.h"
#include "utils/array.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
*** a/src/backend/executor/nodeLockRows.c
--- b/src/backend/executor/nodeLockRows.c
***************
*** 130,136 **** lnext:
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 130,136 ----
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/nodeModifyTable.c
--- b/src/backend/executor/nodeModifyTable.c
***************
*** 328,334 **** ldelete:;
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 328,334 ----
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
***************
*** 516,522 **** lreplace:;
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 516,522 ----
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/nodeSeqscan.c
--- b/src/backend/executor/nodeSeqscan.c
***************
*** 28,33 ****
--- 28,34 ----
#include "access/relscan.h"
#include "executor/execdebug.h"
#include "executor/nodeSeqscan.h"
+ #include "storage/predicate.h"
static void InitScanRelation(SeqScanState *node, EState *estate);
static TupleTableSlot *SeqNext(SeqScanState *node);
***************
*** 105,115 **** SeqRecheck(SeqScanState *node, TupleTableSlot *slot)
--- 106,118 ----
* tuple.
* We call the ExecScan() routine and pass it the appropriate
* access method functions.
+ * For serializable transactions, we first lock the entire relation.
* ----------------------------------------------------------------
*/
TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
+ PredicateLockRelation(node->ss_currentRelation);
return ExecScan((ScanState *) node,
(ExecScanAccessMtd) SeqNext,
(ExecScanRecheckMtd) SeqRecheck);
*** a/src/backend/executor/nodeTidscan.c
--- b/src/backend/executor/nodeTidscan.c
***************
*** 31,36 ****
--- 31,37 ----
#include "executor/nodeTidscan.h"
#include "optimizer/clauses.h"
#include "storage/bufmgr.h"
+ #include "storage/predicate.h"
#include "utils/array.h"
*** a/src/backend/storage/ipc/ipci.c
--- b/src/backend/storage/ipc/ipci.c
***************
*** 105,110 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
--- 105,111 ----
sizeof(ShmemIndexEnt)));
size = add_size(size, BufferShmemSize());
size = add_size(size, LockShmemSize());
+ size = add_size(size, PredicateLockShmemSize());
size = add_size(size, ProcGlobalShmemSize());
size = add_size(size, XLOGShmemSize());
size = add_size(size, CLOGShmemSize());
***************
*** 200,205 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
--- 201,211 ----
InitLocks();
/*
+ * Set up predicate lock manager
+ */
+ InitPredicateLocks();
+
+ /*
* Set up process table
*/
if (!IsUnderPostmaster)
*** a/src/backend/storage/ipc/shmqueue.c
--- b/src/backend/storage/ipc/shmqueue.c
***************
*** 43,56 **** SHMQueueInit(SHM_QUEUE *queue)
* SHMQueueIsDetached -- TRUE if element is not currently
* in a queue.
*/
- #ifdef NOT_USED
bool
SHMQueueIsDetached(SHM_QUEUE *queue)
{
Assert(ShmemAddrIsValid(queue));
return (queue->prev == NULL);
}
- #endif
/*
* SHMQueueElemInit -- clear an element's links
--- 43,54 ----
*** a/src/backend/storage/lmgr/Makefile
--- b/src/backend/storage/lmgr/Makefile
***************
*** 12,18 **** subdir = src/backend/storage/lmgr
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o
include $(top_srcdir)/src/backend/common.mk
--- 12,18 ----
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o predicate.o
include $(top_srcdir)/src/backend/common.mk
*** /dev/null
--- b/src/backend/storage/lmgr/predicate.c
***************
*** 0 ****
--- 1,2373 ----
+ /*-------------------------------------------------------------------------
+ *
+ * predicate.c
+ * POSTGRES predicate locking
+ * to support full serializable transaction isolation
+ *
+ * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
+ * locks, which are so different from normal locks that a distinct set of
+ * structures is required to handle them.
+ *
+ * (1) Besides tuples actually read, they must cover ranges of tuples
+ * which would have been read based on the predicate. This will
+ * require modelling the predicates through locks against database
+ * objects such as pages, index ranges, or entire tables.
+ *
+ * (2) They must be kept in RAM for quick access. Because of this, it
+ * isn't possible to always maintain tuple-level granularity -- when
+ * the space allocated to store these approaches exhaustion, a
+ * request for a lock may need to scan for situations where a single
+ * transaction holds many fine-grained locks which can be coalesced
+ * into a single coarser-grained lock.
+ *
+ * (3) They never block anything; they are more like flags than locks
+ * in that regard; although they refer to database objects and are
+ * used to identify rw-conflicts with normal write locks.
+ *
+ * (4) While they are associated with a transaction, they must survive
+ * a successful COMMIT of that transaction, and remain until all
+ * overlapping transactions complete. This even means that they
+ * must survive termination of the transaction's process. On a
+ * rollback of the top level transaction, all of that transaction's
+ * SIREAD locks should be released, however.
+ *
+ * (5) The only transactions which create SIREAD locks or check for
+ * conflicts with them are serializable transactions.
+ *
+ * (6) When a write lock for a top level transaction is found to cover
+ * an existing SIREAD lock for the same transaction, the SIREAD lock
+ * can be deleted.
+ *
+ * (7) A write from a serializable transaction must ensure that a xact
+ * record exists for the transaction, with the same lifespan (until
+ * all concurrent transactions complete or the transaction is rolled
+ * back) so that rw-dependencies to that transaction can be
+ * detected.
+ *
+ *
+ * Lightweight locks to manage access to the predicate locking shared
+ * memory objects must be taken in this order, and should be released in
+ * reverse order:
+ *
+ * SerializableFinishedListLock
+ * - Protects the list of transactions which have completed but which
+ * may yet matter because they overlap still-active transactions.
+ *
+ * SerializablePredicateLockListLock
+ * - Special handling: use shared mode for walking the list *and*
+ * for modifying the list from the process running the owning
+ * transaction. No other process is allowed to walk the list,
+ * and any other process must acquire exclusive access to modify
+ * it. Once a transaction has completed, it is the holder of
+ * the SerializableFinishedListLock who can walk the list in
+ * shared mode.
+ *
+ * FirstPredicateLockMgrLock based partition locks
+ * - The same lock protects a target and all locks on that target.
+ * - When more than one is needed, acquire in ascending order.
+ *
+ * SerializableXactHashLock
+ * - Protects both SerializableXactHash and SerializableXidHash.
+ *
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+ /*
+ * INTERFACE ROUTINES
+ *
+ * housekeeping for setting up shared memory predicate lock structures
+ * InitPredicateLocks(void)
+ * PredicateLockShmemSize(void)
+ *
+ * predicate lock reporting
+ * PredicateLockData *GetPredicateLockStatusData(void)
+ *
+ * predicate lock maintenance
+ * RegisterSerializableTransaction(Snapshot snapshot)
+ * PredicateLockRelation(Relation relation)
+ * PredicateLockPage(Relation relation, BlockNumber blkno)
+ * PredicateLockTuple(Relation relation, HeapTuple tuple)
+ * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno);
+ * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno);
+ * ReleasePredicateLocks(bool isCommit)
+ *
+ * conflict detection (may also trigger rollback)
+ * CheckForSerializableConflictOut(bool valid, Relation relation,
+ * HeapTupleData *tup, Buffer buffer)
+ * CheckForSerializableConflictIn(Relation relation, HeapTupleData *tup,
+ * Buffer buffer)
+ *
+ * final rollback checking
+ * PreCommit_CheckForSerializationFailure(void)
+ */
+
+ #include "postgres.h"
+
+ #include "access/transam.h"
+ #include "access/twophase.h"
+ #include "access/xact.h"
+ #include "miscadmin.h"
+ #include "storage/bufmgr.h"
+ #include "storage/predicate.h"
+ #include "utils/rel.h"
+ #include "utils/snapmgr.h"
+
+ /*
+ * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
+ * transaction or any of its subtransactions.
+ */
+ typedef struct SERIALIZABLEXIDTAG
+ {
+ TransactionId xid;
+ } SERIALIZABLEXIDTAG;
+
+ /*
+ * Information to link between an xid list and a top level serializable transaction.
+ */
+ typedef struct SERIALIZABLEXID
+ {
+ /* hash key */
+ SERIALIZABLEXIDTAG tag;
+
+ /* data */
+ SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
+ SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
+ * xids */
+ } SERIALIZABLEXID;
+
+ /*
+  * Per-locked-object predicate lock information:
+  *
+  * tag -- uniquely identifies the object being locked
+  * predicateLocks -- list of predicate lock objects for this target.
+  */
+ typedef struct PREDICATELOCKTARGET
+ {
+ 	/* hash key */
+ 	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ 	/* data */
+ 	SHM_QUEUE	predicateLocks; /* list of PREDICATELOCK objects assoc. with
+ 								 * predicate lock target */
+ } PREDICATELOCKTARGET;
+
+ /*
+  * Identifies one predicate lock: the combination of a lockable object and
+  * the serializable transaction holding a lock on it.  Both fields are
+  * pointers into shared memory, so the tag is stable for the life of the
+  * referenced entries.
+  */
+ typedef struct PREDICATELOCKTAG
+ {
+ 	PREDICATELOCKTARGET *myTarget;
+ 	SERIALIZABLEXACT *myXact;
+ } PREDICATELOCKTAG;
+
+ /*
+  * A predicate lock entry.  Each lock is threaded onto two lists: the list
+  * of all locks on its target, and the list of all locks held by its
+  * transaction; both links are unlinked when the lock is removed.
+  */
+ typedef struct PREDICATELOCK
+ {
+ 	/* hash key */
+ 	PREDICATELOCKTAG tag;		/* unique identifier of lockable object */
+
+ 	/* data */
+ 	SHM_QUEUE	targetLink;		/* list link in PREDICATELOCKTARGET's list of
+ 								 * predicate locks */
+ 	SHM_QUEUE	xactLink;		/* list link in SERIALIZABLEXACT's list of
+ 								 * predicate locks */
+ } PREDICATELOCK;
+
+ /*
+  * Backend-local hash table of ancestor (coarser) locks and the number
+  * of (finer-grained) children locks that are currently held. This is
+  * used to determine when to promote multiple fine-grained locks to
+  * one coarse-grained lock.
+  */
+ typedef struct LOCALPREDICATELOCK
+ {
+ 	/* hash key */
+ 	PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ 	/* data */
+ 	bool		held;			/* is lock held, or just its children? */
+ 	int			childLocks;		/* number of child locks currently held */
+ } LOCALPREDICATELOCK;
+
+ /* Created per serializable transaction in RegisterSerializableTransaction */
+ static HTAB *LocalPredicateLockHash = NULL;
+
+
+ /*
+  * Test the most selective fields first, for performance.
+  *
+  * a is covered by b if all of the following hold:
+  * 1) a.database = b.database
+  * 2) a.relation = b.relation
+  * 3) b.offset is invalid (b is page-granularity or higher)
+  * 4) either of the following:
+  * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
+  * or 4b) a.offset is invalid and b.page is invalid (a is
+  * page-granularity and b is relation-granularity)
+  *
+  * Note: "covered" here is strict -- a tag never covers itself, since (3)
+  * plus (4a)/(4b) require b to be strictly coarser than a.
+  */
+ #define TargetTagIsCoveredBy(covered_target, covering_target)			\
+ 	((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */	\
+ 	  GET_PREDICATELOCKTARGETTAG_RELATION(covering_target))				\
+ 	 && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) ==			\
+ 		 InvalidOffsetNumber)								 /* (3) */	\
+ 	 && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) !=			\
+ 		   InvalidOffsetNumber)								 /* (4a) */ \
+ 		  && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) ==		\
+ 			  GET_PREDICATELOCKTARGETTAG_PAGE(covered_target)))			\
+ 		 || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) ==		\
+ 			  InvalidBlockNumber)							 /* (4b) */ \
+ 			 && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target)		\
+ 				 != InvalidBlockNumber)))								\
+ 	 && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) ==	 /* (1) */	\
+ 		 GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
+
+ /*
+  * The predicate locking target and lock shared hash tables are partitioned to
+  * reduce contention.  To determine which partition a given target belongs to,
+  * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
+  * apply one of these macros.
+  * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
+  */
+ #define PredicateLockHashPartition(hashcode) \
+ 	((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
+ #define PredicateLockHashPartitionLock(hashcode) \
+ 	((LWLockId) (FirstPredicateLockMgrLock + PredicateLockHashPartition(hashcode)))
+
+ /* Sizing target for the shared predicate-lock-target hash table */
+ #define NPREDICATELOCKTARGETENTS() \
+ 	mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+ /* A sxact's finishedBefore xid is set only once it has committed. */
+ #define SxactIsCommitted(sxact) TransactionIdIsValid((sxact)->finishedBefore)
+ /*
+  * Did sxactPivotOut commit no later than sxactOther?  A still-running
+  * sxactOther (invalid finishedBefore) is treated as finishing later.
+  */
+ #define SxactCommittedBefore(sxactPivotOut, sxactOther) \
+ 	((!TransactionIdIsValid((sxactOther)->finishedBefore)) \
+ 	|| TransactionIdPrecedesOrEquals((sxactPivotOut)->finishedBefore, (sxactOther)->finishedBefore))
+
+ /*
+  * When a public interface method is called for a split on an index relation,
+  * this is the test to see if we should do a quick return.
+  *
+  * Skips relations with OIDs below FirstBootstrapObjectId (presumably to
+  * exclude system relations -- TODO confirm) and temporary relations.
+  */
+ #define SkipSplitTracking(relation) \
+ 	(((relation)->rd_id < FirstBootstrapObjectId) \
+ 	|| ((relation)->rd_istemp))
+
+ /*
+  * When a public interface method is called for serializing a relation within
+  * the current transaction, this is the test to see if we should do a quick return.
+  */
+ #define SkipSerialization(relation) \
+ 	((!IsXactIsoLevelFullySerializable) \
+ 	|| SkipSplitTracking(relation))
+
+
+ /*
+  * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
+  *
+  * To avoid unnecessary recomputations of the hash code, we try to do this
+  * just once per function, and then pass it around as needed.  Aside from
+  * passing the hashcode to hash_search_with_hash_value(), we can extract
+  * the lock partition number from the hashcode.
+  */
+ #define PredicateLockTargetTagHashCode(predicatelocktargettag) \
+ 	(tag_hash((predicatelocktargettag), sizeof(PREDICATELOCKTARGETTAG)))
+
+ /*
+  * Given a predicate lock tag, and the hash for its target,
+  * compute the lock hash.
+  *
+  * To make the hash code also depend on the transaction, we xor the sxid
+  * struct's address into the hash code, left-shifted so that the
+  * partition-number bits don't change.  Since this is only a hash, we
+  * don't care if we lose high-order bits of the address; use an
+  * intermediate variable to suppress cast-pointer-to-int warnings.
+  *
+  * (The shift binds tighter than the xor, so the low-order
+  * LOG2_NUM_PREDICATELOCK_PARTITIONS bits come solely from targethash,
+  * keeping lock and target in the same partition.)
+  */
+ #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
+ 	((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+
+ /* This configuration variable is used to set the predicate lock table size */
+ int			max_predicate_locks_per_xact;		/* set by guc.c */
+
+ /*
+  * These global variables are maintained when registering and cleaning up
+  * serializable transactions.  They must be global across all backends, but
+  * are not needed outside this source file, so no .h declaration is needed.
+  *
+  * NOTE(review): as written these are ordinary per-process C globals, not
+  * shared memory, so each backend sees only its own copy -- confirm whether
+  * they need to move into a shared structure.
+  */
+ TransactionId SerializableGlobalXmin = InvalidTransactionId;
+ int			SerializableGlobalXminCount = 0;
+
+ /*
+  * The predicate locking hash tables are in shared memory.
+  * Each backend keeps pointers to them.
+  */
+ static HTAB *SerializableXactHash;
+ static HTAB *SerializableXidHash;
+ static HTAB *PredicateLockTargetHash;
+ static HTAB *PredicateLockHash;
+ static SHM_QUEUE *FinishedSerializableTransactions;
+
+ /*
+  * Keep a pointer to the currently-running serializable transaction (if any)
+  * for quick reference.
+  */
+ typedef SERIALIZABLEXACT *SERIALIZABLEXACTPtr;
+
+ #define InvalidSerializableXact ((SERIALIZABLEXACTPtr) NULL)
+ static volatile SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
+
+ /* TODO SSI: Remove volatile qualifier and the then-unnecessary casts? */
+
+ /* The most recently used xid within this transaction, for optimizations. */
+ static TransactionId MyXid = InvalidTransactionId;
+
+
+ /* local functions (forward declarations; definitions appear below) */
+ static uint32 predicatelock_hash(const void *key, Size keysize);
+ static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact);
+ static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *newtargettag);
+ static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
+ static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *targettag);
+ static int	PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag);
+ static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ 						  PREDICATELOCKTARGETTAG *parent);
+ static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
+ static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *tag);
+ static void EnsureMySerializableXidExists(void);
+ static void ClearOldPredicateLocks(void);
+ static bool XidIsConcurrent(TransactionId xid);
+ static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+ static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
+ static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
+
+ /*
+  * InitPredicateLocks -- Initialize the predicate locking data structures.
+  *
+  * This is called from CreateSharedMemoryAndSemaphores(), which see for
+  * more comments.  In the normal postmaster case, the shared hash tables
+  * are created here.  Backends inherit the pointers
+  * to the shared tables via fork().  In the EXEC_BACKEND case, each
+  * backend re-executes this code to obtain pointers to the already existing
+  * shared hash tables.
+  *
+  * Table sizes here must stay in sync with PredicateLockShmemSize().
+  */
+ void
+ InitPredicateLocks(void)
+ {
+ 	HASHCTL		info;
+ 	int			hash_flags;
+ 	long		init_table_size,
+ 				max_table_size;
+ 	bool		found;
+
+ 	/*
+ 	 * Compute init/max size to request for predicate lock target hashtable.
+ 	 * Note these calculations must agree with PredicateLockShmemSize!
+ 	 */
+ 	max_table_size = NPREDICATELOCKTARGETENTS();
+ 	init_table_size = max_table_size / 2;
+
+ 	/*
+ 	 * Allocate hash table for PREDICATELOCKTARGET structs.  This stores
+ 	 * per-predicate-lock-target information.
+ 	 */
+ 	MemSet(&info, 0, sizeof(info));
+ 	info.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ 	info.entrysize = sizeof(PREDICATELOCKTARGET);
+ 	info.hash = tag_hash;
+ 	info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+ 	hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ 	PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
+ 											init_table_size,
+ 											max_table_size,
+ 											&info,
+ 											hash_flags);
+
+ 	/* Assume an average of 2 xacts per target */
+ 	max_table_size *= 2;
+ 	init_table_size *= 2;
+
+ 	/*
+ 	 * Allocate hash table for PREDICATELOCK structs.  This stores per
+ 	 * xact-lock-of-a-target information.
+ 	 *
+ 	 * Uses the custom hash function so each lock lands in the same
+ 	 * partition as its target; see predicatelock_hash().
+ 	 */
+ 	MemSet(&info, 0, sizeof(info));
+ 	info.keysize = sizeof(PREDICATELOCKTAG);
+ 	info.entrysize = sizeof(PREDICATELOCK);
+ 	info.hash = predicatelock_hash;
+ 	info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+ 	hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ 	PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
+ 									  init_table_size,
+ 									  max_table_size,
+ 									  &info,
+ 									  hash_flags);
+
+ 	/*
+ 	 * Compute init/max size to request for serializable transaction
+ 	 * hashtable.  Note these calculations must agree with
+ 	 * PredicateLockShmemSize!
+ 	 */
+ 	max_table_size = MaxBackends;
+ 	init_table_size = max_table_size / 2;
+
+ 	/*
+ 	 * Allocate hash table for SERIALIZABLEXACT structs.  This stores per-vxid
+ 	 * information for serializable transactions which have accessed data.
+ 	 */
+ 	MemSet(&info, 0, sizeof(info));
+ 	info.keysize = sizeof(SERIALIZABLEXACTTAG);
+ 	info.entrysize = sizeof(SERIALIZABLEXACT);
+ 	info.hash = tag_hash;
+ 	hash_flags = (HASH_ELEM | HASH_FUNCTION);
+
+ 	SerializableXactHash = ShmemInitHash("SERIALIZABLEXACT hash",
+ 										 init_table_size,
+ 										 max_table_size,
+ 										 &info,
+ 										 hash_flags);
+
+ 	/* Assume an average of 10 serializable xids per backend. */
+ 	max_table_size *= 10;
+ 	init_table_size *= 10;
+
+ 	/*
+ 	 * Allocate hash table for SERIALIZABLEXID structs.  This stores per-xid
+ 	 * information for serializable transactions which have accessed data.
+ 	 */
+ 	MemSet(&info, 0, sizeof(info));
+ 	info.keysize = sizeof(SERIALIZABLEXIDTAG);
+ 	info.entrysize = sizeof(SERIALIZABLEXID);
+ 	info.hash = tag_hash;
+ 	hash_flags = (HASH_ELEM | HASH_FUNCTION);
+
+ 	SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
+ 										init_table_size,
+ 										max_table_size,
+ 										&info,
+ 										hash_flags);
+
+ 	/*
+ 	 * Create or attach to the header for the list of finished serializable
+ 	 * transactions.  Only the first process to get here initializes it.
+ 	 */
+ 	FinishedSerializableTransactions = (SHM_QUEUE *)
+ 		ShmemInitStruct("FinishedSerializableTransactions",
+ 						sizeof(SHM_QUEUE),
+ 						&found);
+ 	if (!found)
+ 		SHMQueueInit(FinishedSerializableTransactions);
+ }
+
+ /*
+  * PredicateLockShmemSize
+  *		Estimate the shared-memory space used by the predicate lock
+  *		structures.  Must stay in agreement with the table sizes
+  *		requested in InitPredicateLocks().
+  */
+ Size
+ PredicateLockShmemSize(void)
+ {
+ 	long		targets = NPREDICATELOCKTARGETENTS();
+ 	Size		size;
+
+ 	/* predicate lock target hash table */
+ 	size = hash_estimate_size(targets, sizeof(PREDICATELOCKTARGET));
+
+ 	/* predicate lock hash table: assume an average of 2 xacts per target */
+ 	size = add_size(size,
+ 					hash_estimate_size(targets * 2, sizeof(PREDICATELOCK)));
+
+ 	/*
+ 	 * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
+ 	 * margin.
+ 	 */
+ 	size = add_size(size, size / 10);
+
+ 	/* serializable transaction table: one entry per backend */
+ 	size = add_size(size,
+ 					hash_estimate_size(MaxBackends, sizeof(SERIALIZABLEXACT)));
+
+ 	/* serializable xid table: assume 10 xids per backend */
+ 	size = add_size(size,
+ 					hash_estimate_size(MaxBackends * 10, sizeof(SERIALIZABLEXID)));
+
+ 	/* head of the finished serializable transactions list */
+ 	size = add_size(size, sizeof(SHM_QUEUE));
+
+ 	return size;
+ }
+
+
+ /*
+  * predicatelock_hash
+  *		Hash function for the PREDICATELOCK hash table.
+  *
+  * One set of partition LWLocks guards both the PREDICATELOCKTARGET and
+  * PREDICATELOCK tables, so each PREDICATELOCK must fall into the same
+  * partition as the target it references.  dynahash derives the partition
+  * number from the low-order bits of the hash code, so we build the lock's
+  * hash from its target's hash in a way that leaves those bits unchanged.
+  */
+ static uint32
+ predicatelock_hash(const void *key, Size keysize)
+ {
+ 	const PREDICATELOCKTAG *locktag = (const PREDICATELOCKTAG *) key;
+ 	uint32		hash_of_target;
+
+ 	Assert(keysize == sizeof(PREDICATELOCKTAG));
+
+ 	/* Hash the tag of the referenced target... */
+ 	hash_of_target = PredicateLockTargetTagHashCode(&locktag->myTarget->tag);
+
+ 	/* ...then mix in the owning transaction, preserving the low bits. */
+ 	return PredicateLockHashCodeFromTargetHashCode(locktag, hash_of_target);
+ }
+
+
+ /*
+  * GetPredicateLockStatusData
+  *		Return a table containing the internal state of the predicate
+  *		lock manager for use in pg_lock_status.
+  *
+  * Like GetLockStatusData, this function tries to hold the partition LWLocks
+  * for as short a time as possible by returning two arrays that simply
+  * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
+  * table entry.  Multiple copies of the same PREDICATELOCKTARGETTAG and
+  * SERIALIZABLEXACT will likely appear.
+  */
+ PredicateLockData *
+ GetPredicateLockStatusData(void)
+ {
+ 	PredicateLockData *data;
+ 	int			i;
+ 	int			els,
+ 				el;
+ 	HASH_SEQ_STATUS seqstat;
+ 	PREDICATELOCK *predlock;
+
+ 	data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
+
+ 	/*
+ 	 * Acquire locks.  To ensure consistency, take simultaneous locks on
+ 	 * SerializableFinishedListLock, all partition locks in ascending order,
+ 	 * then SerializableXactHashLock.  This order must match every other
+ 	 * site that takes more than one of these locks, to avoid deadlock.
+ 	 */
+ 	LWLockAcquire(SerializableFinishedListLock, LW_SHARED);
+ 	for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ 		LWLockAcquire(FirstPredicateLockMgrLock + i, LW_SHARED);
+ 	LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ 	/* Get number of locks and allocate appropriately-sized arrays. */
+ 	els = hash_get_num_entries(PredicateLockHash);
+ 	data->nelements = els;
+ 	data->locktags = (PREDICATELOCKTARGETTAG *)
+ 		palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
+ 	data->xacts = (SERIALIZABLEXACT *)
+ 		palloc(sizeof(SERIALIZABLEXACT) * els);
+
+
+ 	/* Scan through PredicateLockHash and copy contents */
+ 	hash_seq_init(&seqstat, PredicateLockHash);
+
+ 	el = 0;
+
+ 	while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
+ 	{
+ 		/* Copy by value: the shared entries may go away once we unlock. */
+ 		data->locktags[el] = predlock->tag.myTarget->tag;
+ 		data->xacts[el] = *predlock->tag.myXact;
+ 		el++;
+ 	}
+
+ 	Assert(el == els);
+
+ 	/* Release locks in reverse order */
+ 	LWLockRelease(SerializableXactHashLock);
+ 	for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ 		LWLockRelease(FirstPredicateLockMgrLock + i);
+ 	LWLockRelease(SerializableFinishedListLock);
+
+ 	return data;
+ }
+
+
+ /*
+  * RegisterSerializableTransaction
+  *		Set up MySerializableXact for the current fully serializable
+  *		transaction: create its SERIALIZABLEXACT entry in shared memory
+  *		(keyed by virtual xid), fold its snapshot's xmin into the
+  *		SerializableGlobalXmin bookkeeping, and create the backend-local
+  *		hash table used for lock-promotion tracking.
+  *
+  * Must be called exactly once per serializable transaction, before any
+  * predicate locks are taken.
+  */
+ void
+ RegisterSerializableTransaction(const Snapshot snapshot)
+ {
+ 	PGPROC	   *proc;
+ 	SERIALIZABLEXACTTAG sxacttag;
+ 	SERIALIZABLEXACT *sxact;
+ 	bool		found;
+ 	HASHCTL		hash_ctl;
+
+ 	/* We only do this for serializable transactions.  Once. */
+ 	Assert(IsXactIsoLevelFullySerializable);
+ 	Assert(MySerializableXact == InvalidSerializableXact);
+
+ 	proc = MyProc;
+ 	Assert(proc != NULL);
+ 	GET_VXID_FROM_PGPROC(sxacttag.vxid, *proc);
+
+ 	/* Track the oldest xmin among active serializable transactions. */
+ 	LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ 	if (!TransactionIdIsValid(SerializableGlobalXmin))
+ 	{
+ 		Assert(SerializableGlobalXminCount == 0);
+ 		SerializableGlobalXmin = snapshot->xmin;
+ 		SerializableGlobalXminCount = 1;
+ 	}
+ 	else if (SerializableGlobalXmin == snapshot->xmin)
+ 	{
+ 		Assert(SerializableGlobalXminCount > 0);
+ 		SerializableGlobalXminCount++;
+ 	}
+ 	else
+ 	{
+ 		/* A new snapshot's xmin can never move backwards. */
+ 		Assert(TransactionIdFollows(snapshot->xmin, SerializableGlobalXmin));
+ 	}
+ 	sxact = (SERIALIZABLEXACT *) hash_search(SerializableXactHash,
+ 											 &sxacttag,
+ 											 HASH_ENTER, &found);
+ 	if (!sxact)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OUT_OF_MEMORY),
+ 				 errmsg("out of shared memory"),
+ 				 errhint("You might need to increase max_predicate_locks_per_transaction.")));
+
+ 	/*
+ 	 * Only consult "found" after verifying the entry was created; it is not
+ 	 * meaningful when hash_search fails.  Our vxid can't already be present.
+ 	 */
+ 	Assert(!found);
+
+ 	/* Initialize the structure. */
+ 	sxact->outConflict = InvalidSerializableXact;
+ 	sxact->inConflict = InvalidSerializableXact;
+ 	sxact->topXid = GetTopTransactionIdIfAny();
+ 	sxact->finishedBefore = InvalidTransactionId;
+ 	sxact->xmin = snapshot->xmin;
+ 	SHMQueueInit(&(sxact->predicateLocks));
+ 	SHMQueueInit(&(sxact->xids));
+ 	SHMQueueElemInit(&(sxact->finishedLink));
+ 	sxact->rolledBack = false;
+ 	LWLockRelease(SerializableXactHashLock);
+
+ 	MySerializableXact = sxact;
+
+ 	/* Initialize the backend-local hash table of parent locks */
+ 	Assert(LocalPredicateLockHash == NULL);
+ 	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ 	hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ 	hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ 	hash_ctl.hash = tag_hash;
+ 	LocalPredicateLockHash = hash_create("Local predicate lock",
+ 										 max_predicate_locks_per_xact,
+ 										 &hash_ctl,
+ 										 HASH_ELEM | HASH_FUNCTION);
+ }
+
+ /*
+  * Make sure the current transaction's assigned xid (if any) has a
+  * SERIALIZABLEXID entry in SerializableXidHash linking it back to
+  * MySerializableXact, so other backends can map the xid to this
+  * serializable transaction.  Also refreshes the cached top-level xid.
+  *
+  * MySerializableXact must already have been set up by
+  * RegisterSerializableTransaction.
+  */
+ static void
+ EnsureMySerializableXidExists(void)
+ {
+ 	TransactionId xid;
+
+ 	Assert(MySerializableXact != InvalidSerializableXact);
+
+ 	MySerializableXact->topXid = GetTopTransactionIdIfAny();
+
+ 	/*
+ 	 * If this isn't the xid we've most recently seen for this vxid, make sure
+ 	 * it's in the hash table.  MyXid caches the last xid handled so repeat
+ 	 * calls within the same (sub)transaction skip the shared-table work.
+ 	 */
+ 	xid = GetCurrentTransactionIdIfAny();
+ 	if (MyXid != xid)
+ 	{
+ 		SERIALIZABLEXIDTAG sxidtag;
+ 		SERIALIZABLEXID *sxid;
+ 		bool		found;
+
+ 		Assert(TransactionIdIsValid(xid));
+
+ 		sxidtag.xid = xid;
+ 		LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ 		sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ 											   &sxidtag,
+ 											   HASH_ENTER, &found);
+ 		if (!sxid)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_OUT_OF_MEMORY),
+ 					 errmsg("out of shared memory"),
+ 					 errhint("You might need to increase max_predicate_locks_per_transaction.")));
+
+ 		/* Initialize the structure. */
+ 		if (!found)
+ 		{
+ 			sxid->myXact = (SERIALIZABLEXACT *) MySerializableXact;
+ 			SHMQueueInsertBefore(&(((SERIALIZABLEXACT *) MySerializableXact)->xids),
+ 								 &(sxid->xactLink));
+ 		}
+ 		LWLockRelease(SerializableXactHashLock);
+ 		MyXid = xid;
+ 	}
+ }
+
+
+ /*
+  * PredicateLockExists
+  *		Does the current transaction already hold a predicate lock on
+  *		this exact target?  Consults only the backend-local table.
+  */
+ static bool
+ PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
+ {
+ 	LOCALPREDICATELOCK *entry;
+
+ 	entry = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ 											   targettag,
+ 											   HASH_FIND, NULL);
+
+ 	/*
+ 	 * An entry can exist merely as the parent of finer-grained locks, so
+ 	 * presence in the table alone isn't enough; the held flag decides.
+ 	 */
+ 	return entry != NULL && entry->held;
+ }
+
+ /*
+  * GetParentPredicateLockTag
+  *		Produce the next coarser lock tag in the granularity hierarchy:
+  *		tuple -> page -> relation.
+  *
+  * On success stores the parent tag into *parent and returns true;
+  * returns false for a relation tag, which has no parent.
+  */
+ static bool
+ GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ 						  PREDICATELOCKTARGETTAG *parent)
+ {
+ 	switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ 	{
+ 		case PREDLOCKTAG_TUPLE:
+ 			/* a tuple is covered by a lock on its page */
+ 			SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
+ 									GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ 									GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
+ 									GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
+ 			return true;
+
+ 		case PREDLOCKTAG_PAGE:
+ 			/* a page is covered by a lock on its relation */
+ 			SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
+ 									GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ 									GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
+ 			return true;
+
+ 		case PREDLOCKTAG_RELATION:
+ 			/* relation is the coarsest granularity; no parent exists */
+ 			return false;
+ 	}
+
+ 	Assert(false);				/* unrecognized tag type; not reachable */
+ 	return false;
+ }
+
+ /*
+  * CoarserLockCovers
+  *		Is the lock we are about to take already implied by a
+  *		coarser-granularity lock this transaction holds?
+  */
+ static bool
+ CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
+ {
+ 	PREDICATELOCKTARGETTAG current,
+ 				ancestor;
+
+ 	/* Walk up the granularity hierarchy, testing each ancestor in turn. */
+ 	current = *newtargettag;
+ 	while (GetParentPredicateLockTag(&current, &ancestor))
+ 	{
+ 		if (PredicateLockExists(&ancestor))
+ 			return true;
+ 		current = ancestor;
+ 	}
+
+ 	/* ran out of ancestors; nothing covers the new lock */
+ 	return false;
+ }
+
+
+ /*
+  * Delete child target locks owned by this process.
+  * This implementation is assuming that the usage of each target tag field
+  * is uniform.  No need to make this hard if we don't have to.
+  *
+  * We aren't acquiring lightweight locks for the predicate lock or lock
+  * target structures associated with this transaction unless we're going
+  * to modify them, because no other process is permitted to modify our
+  * locks.
+  *
+  * Walks this transaction's list of predicate locks and removes every one
+  * whose target is strictly covered by newtargettag (see
+  * TargetTagIsCoveredBy), dropping the target itself once its lock list
+  * becomes empty, and fixing up the local parent-lock counts.
+  */
+ static void
+ DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
+ {
+ 	SERIALIZABLEXACT *sxact;
+ 	PREDICATELOCK *predlock;
+
+ 	LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ 	sxact = (SERIALIZABLEXACT *) MySerializableXact;
+ 	predlock = (PREDICATELOCK *)
+ 		SHMQueueNext(&(sxact->predicateLocks),
+ 					 &(sxact->predicateLocks),
+ 					 offsetof(PREDICATELOCK, xactLink));
+ 	while (predlock)
+ 	{
+ 		SHM_QUEUE  *predlocksxactlink;
+ 		PREDICATELOCK *nextpredlock;
+ 		PREDICATELOCKTAG oldlocktag;
+ 		PREDICATELOCKTARGET *oldtarget;
+ 		PREDICATELOCKTARGETTAG oldtargettag;
+
+ 		/* Fetch the next link before we possibly delete this entry. */
+ 		predlocksxactlink = &(predlock->xactLink);
+ 		nextpredlock = (PREDICATELOCK *)
+ 			SHMQueueNext(&(sxact->predicateLocks),
+ 						 predlocksxactlink,
+ 						 offsetof(PREDICATELOCK, xactLink));
+
+ 		oldlocktag = predlock->tag;
+ 		Assert(oldlocktag.myXact == sxact);
+ 		oldtarget = oldlocktag.myTarget;
+ 		oldtargettag = oldtarget->tag;
+
+ 		if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
+ 		{
+ 			uint32		oldtargettaghash;
+ 			LWLockId	partitionLock;
+ 			PREDICATELOCK *rmpredlock;
+ 			PREDICATELOCKTARGET *rmtarget;
+
+ 			/* Shared structures change here, so take the partition lock. */
+ 			oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ 			partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+
+ 			LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ 			SHMQueueDelete(predlocksxactlink);
+ 			SHMQueueDelete(&(predlock->targetLink));
+ 			rmpredlock = hash_search_with_hash_value(PredicateLockHash,
+ 													 &oldlocktag,
+ 													 PredicateLockHashCodeFromTargetHashCode(&oldlocktag, oldtargettaghash),
+ 													 HASH_REMOVE, NULL);
+ 			Assert(rmpredlock == predlock);
+
+ 			/* Drop the target too once no lock references it. */
+ 			if (SHMQueueEmpty(&oldtarget->predicateLocks))
+ 			{
+ 				rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ 													   &oldtargettag,
+ 													   oldtargettaghash,
+ 													   HASH_REMOVE, NULL);
+ 				Assert(rmtarget == oldtarget);
+ 			}
+
+ 			LWLockRelease(partitionLock);
+
+ 			/* Keep the local promotion bookkeeping in sync. */
+ 			DecrementParentLocks(&oldtargettag);
+ 		}
+
+ 		predlock = nextpredlock;
+ 	}
+ 	LWLockRelease(SerializablePredicateLockListLock);
+ }
+
+ /*
+  * PredicateLockPromotionThreshold
+  *		Number of descendant locks at which we promote to the given tag.
+  *		The count includes non-direct descendants, e.g. both tuples and
+  *		pages for a relation lock.
+  *
+  * TODO SSI: We should do something more intelligent about what the
+  * thresholds are, either making it proportional to the number of
+  * tuples in a page & pages in a relation, or at least making it a
+  * GUC.  Currently the threshold is 3 for a page lock, and
+  * max_predicate_locks_per_transaction/2 for a relation lock, chosen
+  * entirely arbitrarily (and without benchmarking).
+  */
+ static int
+ PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag)
+ {
+ 	switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ 	{
+ 		case PREDLOCKTAG_TUPLE:
+ 			/*
+ 			 * Nothing is finer-granularity than a tuple, so promotion to
+ 			 * one can never be requested.
+ 			 */
+ 			Assert(false);
+ 			return 0;
+
+ 		case PREDLOCKTAG_PAGE:
+ 			/* arbitrary constant; see TODO above */
+ 			return 3;
+
+ 		case PREDLOCKTAG_RELATION:
+ 			return max_predicate_locks_per_xact / 2;
+ 	}
+
+ 	Assert(false);				/* unrecognized tag type; not reachable */
+ 	return 0;
+ }
+
+ /*
+  * For all ancestors of a newly-acquired predicate lock, increment
+  * their child count in the parent hash table.  If any of them have
+  * more descendants than their promotion threshold, acquire the
+  * coarsest such lock.
+  *
+  * Returns true if a parent lock was acquired and false otherwise.
+  */
+ static bool
+ CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
+ {
+ 	PREDICATELOCKTARGETTAG targettag,
+ 				nexttag,
+ 				promotiontag;
+ 	LOCALPREDICATELOCK *parentlock;
+ 	bool		found,
+ 				promote;
+
+ 	promote = false;
+
+ 	targettag = *reqtag;
+
+ 	/* check parents iteratively */
+ 	while (GetParentPredicateLockTag(&targettag, &nexttag))
+ 	{
+ 		targettag = nexttag;
+ 		/* Create the ancestor's local entry on first sight. */
+ 		parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ 														&targettag,
+ 														HASH_ENTER,
+ 														&found);
+ 		if (!found)
+ 		{
+ 			parentlock->held = false;
+ 			parentlock->childLocks = 1;
+ 		}
+ 		else
+ 			parentlock->childLocks++;
+
+ 		if (parentlock->childLocks >=
+ 			PredicateLockPromotionThreshold(&targettag))
+ 		{
+ 			/*
+ 			 * We should promote to this parent lock.  Continue to check its
+ 			 * ancestors, however, both to get their child counts right and to
+ 			 * check whether we should just go ahead and promote to one of
+ 			 * them.  (promotiontag ends up as the coarsest eligible tag.)
+ 			 */
+ 			promotiontag = targettag;
+ 			promote = true;
+ 		}
+ 	}
+
+ 	if (promote)
+ 	{
+ 		/* acquire coarsest ancestor eligible for promotion */
+ 		PredicateLockAcquire(&promotiontag);
+ 		return true;
+ 	}
+ 	else
+ 		return false;
+ }
+
+ /*
+  * When releasing a lock, decrement the child count on all ancestor
+  * locks.
+  *
+  * This is called only when releasing a lock via
+  * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
+  * we've acquired its parent, possibly due to promotion) or when a new
+  * MVCC write lock makes the predicate lock unnecessary.  There's no
+  * point in calling it when locks are released at transaction end, as
+  * this information is no longer needed.
+  */
+ static void
+ DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
+ {
+ 	PREDICATELOCKTARGETTAG parenttag,
+ 				nexttag;
+
+ 	parenttag = *targettag;
+
+ 	while (GetParentPredicateLockTag(&parenttag, &nexttag))
+ 	{
+ 		uint32		targettaghash;
+ 		LOCALPREDICATELOCK *parentlock,
+ 				   *rmlock;
+
+ 		parenttag = nexttag;
+ 		targettaghash = PredicateLockTargetTagHashCode(&parenttag);
+ 		/* An ancestor entry must exist; we incremented it on acquisition. */
+ 		parentlock = (LOCALPREDICATELOCK *)
+ 			hash_search_with_hash_value(LocalPredicateLockHash,
+ 										&parenttag, targettaghash,
+ 										HASH_FIND, NULL);
+ 		Assert(parentlock != NULL);
+ 		parentlock->childLocks--;
+
+ 		Assert(parentlock->childLocks >= 0);
+
+ 		/*
+ 		 * Discard entries that exist only to count children, once no
+ 		 * children remain; entries for held locks stay.
+ 		 */
+ 		if ((parentlock->childLocks == 0) && (!parentlock->held))
+ 		{
+ 			rmlock = (LOCALPREDICATELOCK *)
+ 				hash_search_with_hash_value(LocalPredicateLockHash,
+ 											&parenttag, targettaghash,
+ 											HASH_REMOVE, NULL);
+ 			Assert(rmlock == parentlock);
+ 		}
+ 	}
+ }
+
+ /*
+  * Acquire a predicate lock on the specified target for the current
+  * connection if not already held.  Create related serializable transaction
+  * and predicate lock target entries first if missing.
+  *
+  * After acquisition, either promotes to a coarser-granularity lock (when
+  * enough sibling locks have accumulated) or deletes now-redundant
+  * finer-granularity locks covered by the new one.
+  */
+ static void
+ PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
+ {
+ 	uint32		targettaghash;
+ 	LWLockId	partitionLock;
+ 	bool		found;
+ 	PREDICATELOCKTARGET *target;
+ 	PREDICATELOCKTAG locktag;
+ 	PREDICATELOCK *lock;
+ 	LOCALPREDICATELOCK *locallock;
+
+ 	EnsureMySerializableXidExists();
+
+ 	/* Do we have the lock already, or a covering lock? */
+ 	if (PredicateLockExists(targettag))
+ 		return;
+
+ 	if (CoarserLockCovers(targettag))
+ 		return;
+
+ 	/* the same hash and LW lock apply to the lock target and the local lock. */
+ 	targettaghash = PredicateLockTargetTagHashCode(targettag);
+ 	partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ 	/* Acquire lock in local table */
+ 	locallock = (LOCALPREDICATELOCK *)
+ 		hash_search_with_hash_value(LocalPredicateLockHash,
+ 									targettag, targettaghash,
+ 									HASH_ENTER, &found);
+ 	/* We should not hold the lock (but its entry might still exist) */
+ 	Assert(!found || !locallock->held);
+ 	locallock->held = true;
+ 	if (!found)
+ 		locallock->childLocks = 0;
+
+ 	/* List lock before partition lock; this order is used throughout. */
+ 	LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ 	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ 	/* Make sure that the target is represented. */
+ 	target = (PREDICATELOCKTARGET *)
+ 		hash_search_with_hash_value(PredicateLockTargetHash,
+ 									targettag, targettaghash,
+ 									HASH_ENTER, &found);
+ 	if (!target)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OUT_OF_MEMORY),
+ 				 errmsg("out of shared memory"),
+ 				 errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ 	if (!found)
+ 		SHMQueueInit(&(target->predicateLocks));
+
+ 	/* We've got the sxact and target, make sure they're joined. */
+ 	locktag.myTarget = target;
+ 	locktag.myXact = (SERIALIZABLEXACT *) MySerializableXact;
+ 	lock = (PREDICATELOCK *)
+ 		hash_search_with_hash_value(PredicateLockHash, &locktag,
+ 									PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
+ 									HASH_ENTER, &found);
+ 	if (!lock)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OUT_OF_MEMORY),
+ 				 errmsg("out of shared memory"),
+ 				 errhint("You might need to increase max_predicate_locks_per_transaction.")));
+
+ 	if (!found)
+ 	{
+ 		SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink));
+ 		SHMQueueInsertBefore((SHM_QUEUE *) &(MySerializableXact->predicateLocks), &(lock->xactLink));
+ 	}
+
+ 	LWLockRelease(partitionLock);
+ 	LWLockRelease(SerializablePredicateLockListLock);
+
+ 	/*
+ 	 * Lock has been acquired.  Check whether it should be promoted to a
+ 	 * coarser granularity, or whether there are finer-granularity locks to
+ 	 * clean up.
+ 	 */
+ 	if (CheckAndPromotePredicateLockRequest(targettag))
+ 	{
+ 		/*
+ 		 * Lock request was promoted to a coarser-granularity lock, and that
+ 		 * lock was acquired.  It will delete this lock and any of its
+ 		 * children, so we're done.
+ 		 */
+ 	}
+ 	else
+ 	{
+ 		/* Clean up any finer-granularity locks */
+ 		if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
+ 			DeleteChildTargetLocks(targettag);
+ 	}
+ }
+
+
+ /*
+ * PredicateLockRelation
+ *
+ * Gets a predicate lock at the relation level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+ void
+ PredicateLockRelation(const Relation relation)
+ {
+ PREDICATELOCKTARGETTAG tag;
+
+ if (SkipSerialization(relation))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ PredicateLockAcquire(&tag);
+ }
+
+ /*
+  * PredicateLockPage
+  *		Take a page-granularity predicate lock.
+  *
+  * Quick return if not in full serializable transaction isolation level,
+  * or if this is a temporary table.  PredicateLockAcquire itself skips
+  * the request when a coarser lock already covers this page, and clears
+  * any finer-grained locks the new one makes redundant.
+  */
+ void
+ PredicateLockPage(const Relation relation, const BlockNumber blkno)
+ {
+ 	PREDICATELOCKTARGETTAG targettag;
+
+ 	if (SkipSerialization(relation))
+ 		return;
+
+ 	SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ 									relation->rd_node.dbNode,
+ 									relation->rd_id,
+ 									blkno);
+ 	PredicateLockAcquire(&targettag);
+ }
+
+ /*
+  * PredicateLockTuple
+  *
+  * Gets a predicate lock at the tuple level.
+  * Skip if not in full serializable transaction isolation level.
+  * Skip if this is a temporary table.
+  * Skip if this transaction wrote the tuple itself (its own write lock
+  * already protects it).
+  */
+ void
+ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
+ {
+ 	PREDICATELOCKTARGETTAG tag;
+ 	ItemPointer tid;
+
+ 	if (SkipSerialization(relation))
+ 		return;
+
+ 	/*
+ 	 * If it's a heap tuple, return if this xact wrote it.  It might be useful
+ 	 * to pass in the xmin from the tuple as another parameter.
+ 	 */
+ 	if (relation->rd_index == NULL)
+ 	{
+ 		SERIALIZABLEXIDTAG sxidtag;
+ 		SERIALIZABLEXID *sxid;
+
+ 		/* Map the tuple's xmin back to a serializable transaction, if any. */
+ 		sxidtag.xid = HeapTupleHeaderGetXmin(tuple->t_data);
+ 		LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ 		sxid = (SERIALIZABLEXID *)
+ 			hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ 		if (sxid)
+ 		{
+ 			if (sxid->myXact == MySerializableXact)
+ 			{
+ 				/* We wrote it; we already have a write lock. */
+ 				LWLockRelease(SerializableXactHashLock);
+ 				return;
+ 			}
+ 		}
+ 		LWLockRelease(SerializableXactHashLock);
+ 	}
+
+ 	tid = &(tuple->t_self);
+ 	SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ 									 relation->rd_node.dbNode,
+ 									 relation->rd_id,
+ 									 ItemPointerGetBlockNumber(tid),
+ 									 ItemPointerGetOffsetNumber(tid));
+ 	PredicateLockAcquire(&tag);
+ }
+
+ /*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock which is in shared memory. This could cause
+ * problems if enough page splits occur on locked pages without the processes
+ * which hold the locks getting in and noticing.
+ */
+ void
+ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno)
+ {
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ uint32 oldtargettaghash;
+ LWLockId oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLockId newpartitionLock;
+
+ /* Split tracking applies to all isolation levels; skip temp/toast only. */
+ if (SkipSplitTracking(relation))
+ return;
+
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+
+ /* Build page-granularity target tags for both halves of the split. */
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ newblkno);
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+
+ LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
+
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock, LW_SHARED);
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock, LW_SHARED);
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and create new ones for the new block
+ * number.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ bool found;
+ PREDICATELOCK *oldpredlock;
+ PREDICATELOCKTAG newpredlocktag;
+
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_ENTER, &found);
+ Assert(!found);
+ /*
+ * NOTE(review): this ereport (and the one below) aborts while still
+ * holding the list lock and partition locks; presumably abort-time
+ * LWLock cleanup releases them -- confirm.
+ */
+ if (!newtarget)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ SHMQueueInit(&(newtarget->predicateLocks));
+
+ newpredlocktag.myTarget = newtarget;
+
+ /* Walk the old target's lock list, duplicating each lock onto the new target. */
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (oldpredlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+
+ /* Save the next-list-entry pointer before touching anything else. */
+ predlocktargetlink = &(oldpredlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, newtargettaghash),
+ HASH_ENTER, &found);
+ if (!newpredlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ Assert(!found);
+ /* Link the duplicated lock into both the target's and the xact's lists. */
+ SHMQueueInsertBefore(&(newtarget->predicateLocks), &(newpredlock->targetLink));
+ SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks), &(newpredlock->xactLink));
+
+ oldpredlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ }
+
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+ }
+
+ /*
+ * PredicateLockPageCombine
+ *
+ * Combines predicate locks for two existing pages.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page combine affects all serializable
+ * transactions, even if it occurs in the context of another
+ * transaction isolation level.
+ */
+ void
+ PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno)
+ {
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ uint32 oldtargettaghash;
+ LWLockId oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLockId newpartitionLock;
+
+ /* Split tracking applies to all isolation levels; skip temp/toast only. */
+ if (SkipSplitTracking(relation))
+ return;
+
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+
+ /* Build page-granularity target tags for the source and surviving pages. */
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ newblkno);
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+
+ LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
+
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE);
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and create new ones for the new block
+ * number, while deleting the old ones.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ PREDICATELOCK *oldpredlock;
+ PREDICATELOCKTAG newpredlocktag;
+
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_FIND, NULL);
+ /*
+ * NOTE(review): assumes the surviving page's target already exists
+ * whenever the old one does (presumably created by the earlier page
+ * split) -- confirm this invariant.
+ */
+ Assert(newtarget);
+
+ newpredlocktag.myTarget = newtarget;
+
+ /* Move each lock from the old target to the surviving target. */
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (oldpredlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ bool found;
+
+ /* Save the next-list-entry pointer before removing this lock. */
+ predlocktargetlink = &(oldpredlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+
+ /* Drop the old-page lock entry. */
+ hash_search_with_hash_value(PredicateLockHash,
+ &oldpredlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag, oldtargettaghash),
+ HASH_REMOVE, NULL);
+
+ /* The same xact may already hold a lock on the surviving page. */
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, newtargettaghash),
+ HASH_ENTER, &found);
+ if (!newpredlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(newtarget->predicateLocks), &(newpredlock->targetLink));
+ SHMQueueInsertBefore((SHM_QUEUE *) &(newpredlocktag.myXact->predicateLocks), &(newpredlock->xactLink));
+ }
+
+ oldpredlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ /* The old target's lock list must now be empty; retire the target. */
+ Assert(SHMQueueIsDetached(&oldtarget->predicateLocks));
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_REMOVE, NULL);
+ }
+
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+ }
+
+ /*
+ * ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current
+ * transaction, whether committed or rolled back.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * For a rollback, the current transaction's predicate locks could be
+ * immediately released; however, we may still have conflict pointers to
+ * our transaction which could be expensive to find and eliminate right
+ * now, so we flag it as rolled back so that it will be ignored, and let
+ * cleanup happen later.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transaction still
+ * holding locks.
+ */
+ void
+ ReleasePredicateLocks(const bool isCommit)
+ {
+ if (!IsXactIsoLevelFullySerializable)
+ return;
+
+ /*
+ * If this transaction has done anything we need to clean up, we'll have a
+ * local pointer to the SERIALIZABLEXACT object.
+ */
+ if (MySerializableXact != InvalidSerializableXact)
+ {
+ /* We'd better not already be on the cleanup list. */
+ Assert(SHMQueueIsDetached((SHM_QUEUE *) &MySerializableXact->finishedLink));
+
+ /*
+ * If it's not a commit it's a rollback, and we can clear our locks
+ * immediately. TODO SSI: Clear the locks, but leave the sxact
+ * record.
+ */
+ if (!isCommit)
+ MySerializableXact->rolledBack = true;
+
+ /*
+ * Add this to the list of transactions to check for later cleanup.
+ * First turn pointers to already-terminated transactions to self-
+ * references.
+ */
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+ if (MySerializableXact->inConflict != InvalidSerializableXact
+ && SxactIsCommitted(MySerializableXact->inConflict))
+ MySerializableXact->inConflict = (SERIALIZABLEXACT *) MySerializableXact;
+ if (MySerializableXact->outConflict != InvalidSerializableXact
+ && SxactIsCommitted(MySerializableXact->outConflict))
+ MySerializableXact->outConflict = (SERIALIZABLEXACT *) MySerializableXact;
+ SHMQueueInsertBefore(FinishedSerializableTransactions,
+ (SHM_QUEUE *) &(MySerializableXact->finishedLink));
+ LWLockRelease(SerializableFinishedListLock);
+
+ /*
+ * Check whether it's time to clean up old transactions. This can only
+ * be done when the last serializable transaction with the oldest xmin
+ * among serializable transactions completes. We then find the "new
+ * oldest" xmin and purge any transactions which finished before this
+ * transaction was launched.
+ */
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ if (MySerializableXact->xmin == SerializableGlobalXmin)
+ {
+ /* We share the global xmin; drop our reference count. */
+ Assert(SerializableGlobalXminCount > 0);
+ if (--SerializableGlobalXminCount == 0)
+ {
+ /* We need to walk the hash table and find the new xmin. */
+ HASH_SEQ_STATUS seqstat;
+ SERIALIZABLEXACT *sxact;
+
+ SerializableGlobalXmin = InvalidTransactionId;
+ hash_seq_init(&seqstat, SerializableXactHash);
+ while ((sxact = (SERIALIZABLEXACT *) hash_seq_search(&seqstat)))
+ {
+ /* Only still-active transactions contribute to the xmin. */
+ if (!SxactIsCommitted(sxact))
+ {
+ if (!TransactionIdIsValid(SerializableGlobalXmin)
+ || TransactionIdPrecedes(sxact->xmin, SerializableGlobalXmin))
+ {
+ SerializableGlobalXmin = sxact->xmin;
+ SerializableGlobalXminCount = 1;
+ }
+ else if (sxact->xmin == SerializableGlobalXmin)
+ SerializableGlobalXminCount++;
+ }
+ }
+ /* Look for xids to clear. */
+ LWLockRelease(SerializableXactHashLock);
+ ClearOldPredicateLocks();
+ }
+ else
+ LWLockRelease(SerializableXactHashLock);
+ }
+ else
+ LWLockRelease(SerializableXactHashLock);
+
+ /* Reset backend-local serializable-transaction state. */
+ MySerializableXact = InvalidSerializableXact;
+ MyXid = InvalidTransactionId;
+
+ /* Delete per-transaction lock table */
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+ }
+ }
+
+ /*
+ * Clear old predicate locks.
+ */
+ static void
+ ClearOldPredicateLocks(void)
+ {
+ SERIALIZABLEXACT *finishedSxact;
+ SHM_QUEUE *link;
+
+ /*
+ * If the lock is already held, skip it; another thread is probably
+ * already on it. In any event, the next transaction to terminate will
+ * have another shot at it.
+ */
+ if (LWLockConditionalAcquire(SerializableFinishedListLock, LW_EXCLUSIVE))
+ {
+ finishedSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ FinishedSerializableTransactions,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ while (finishedSxact)
+ {
+ SERIALIZABLEXACT *nextSxact;
+
+ /* Save the next-list-entry pointer before possibly deleting this one. */
+ link = &(finishedSxact->finishedLink);
+ nextSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ link,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ /* Release any sxact no overlapping serializable xact can still see. */
+ if (!TransactionIdIsValid(SerializableGlobalXmin)
+ || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
+ SerializableGlobalXmin))
+ {
+ SHMQueueDelete(link);
+ ReleaseOneSerializableXact(finishedSxact);
+ }
+ finishedSxact = nextSxact;
+ }
+ LWLockRelease(SerializableFinishedListLock);
+ }
+ }
+
+ /*
+ * This is the normal way to delete anything from any of the predicate
+ * locking hash tables. Given a transaction which we know can be deleted,
+ * delete all predicate locks held by that transaction, and any predicate
+ * lock targets which are now unreferenced by a lock; delete all xid values
+ * for the transaction; then delete the transaction.
+ */
+ static void
+ ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact)
+ {
+ PREDICATELOCK *predlock;
+ SERIALIZABLEXID *sxid;
+
+ Assert(sxact != NULL);
+ Assert(sxact->rolledBack || SxactIsCommitted(sxact));
+
+ /*
+ * NOTE(review): the list lock is held only LW_SHARED here while we
+ * delete entries from the per-target queues; presumably the exclusive
+ * partition lock taken per target makes that safe -- confirm.
+ */
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG tag;
+ SHM_QUEUE *targetLink;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLockId partitionLock;
+
+ /* Save the next-list-entry pointer before deleting this lock. */
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(predlock->xactLink),
+ offsetof(PREDICATELOCK, xactLink));
+
+ /* Copy everything we need before the entry is removed. */
+ tag = predlock->tag;
+ targetLink = &(predlock->targetLink);
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ SHMQueueDelete(targetLink);
+
+ /*
+ * No need to do retail removal from transaction object; it's going
+ * away.
+ */
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ /* Retire the target too once no lock references it. */
+ if (SHMQueueEmpty(&target->predicateLocks))
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &targettag, targettaghash, HASH_REMOVE, NULL);
+ LWLockRelease(partitionLock);
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializablePredicateLockListLock);
+
+ /* Get rid of the xids and the record of the transaction itself. */
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxid = (SERIALIZABLEXID *)
+ SHMQueueNext(&(sxact->xids),
+ &(sxact->xids),
+ offsetof(SERIALIZABLEXID, xactLink));
+ while (sxid)
+ {
+ SERIALIZABLEXID *nextsxid;
+ SERIALIZABLEXIDTAG tag;
+
+ nextsxid = (SERIALIZABLEXID *)
+ SHMQueueNext(&(sxact->xids),
+ &(sxid->xactLink),
+ offsetof(SERIALIZABLEXID, xactLink));
+ tag = sxid->tag;
+ hash_search(SerializableXidHash, &tag, HASH_REMOVE, NULL);
+
+ /*
+ * No need to do retail removal from transaction object; it's going
+ * away.
+ */
+ sxid = nextsxid;
+ }
+ Assert(SHMQueueIsDetached(&(sxact->finishedLink)));
+ hash_search(SerializableXactHash, &(sxact->tag), HASH_REMOVE, NULL);
+ LWLockRelease(SerializableXactHashLock);
+ }
+
+ /*
+ * Tests whether the given transaction is concurrent with (overlaps)
+ * our current transaction.
+ */
+ static bool
+ XidIsConcurrent(TransactionId xid)
+ {
+ Snapshot snap;
+ uint32 i;
+
+ Assert(TransactionIdIsValid(xid));
+
+ /*
+ * We don't count our own transaction or its subtransactions as
+ * "concurrent".
+ */
+ if (xid == GetTopTransactionIdIfAny())
+ return false;
+
+ snap = GetTransactionSnapshot();
+
+ /* Committed before our snapshot's xmin: definitely not concurrent. */
+ if (TransactionIdPrecedes(xid, snap->xmin))
+ return false;
+
+ /* Started at or after our snapshot's xmax: definitely concurrent. */
+ if (TransactionIdFollowsOrEquals(xid, snap->xmax))
+ return true;
+
+ /* In between: concurrent iff it was in progress when we took the snapshot. */
+ for (i = 0; i < snap->xcnt; i++)
+ {
+ if (xid == snap->xip[i])
+ return true;
+ }
+
+ return false;
+ }
+
+ /*
+ * CheckForSerializableConflictOut
+ * We are reading a tuple which has been modified. If it is visible to
+ * us but has been deleted, that indicates a rw-conflict out. If it's
+ * not visible and was created by a concurrent (overlapping)
+ * serializable transaction, that is also a rw-conflict out,
+ *
+ * The heap tables which we maintain for predicate locking will also be used
+ * to determine that the xmin from a row is related to a serializable
+ * transaction, and will provide a mapping to the top level transaction.
+ *
+ * This function should be called just about anywhere in heapam.c that a
+ * tuple has been read.
+ */
+ void
+ CheckForSerializableConflictOut(const bool valid, const Relation relation,
+ const HeapTuple tuple, const Buffer buffer)
+ {
+ TransactionId xid;
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACTTAG sxacttag;
+ SERIALIZABLEXACT *sxact;
+
+ /* Common bail-out: wrong isolation level, temp table, etc. */
+ if (SkipSerialization(relation))
+ return;
+
+ if (valid)
+ {
+ /*----------------------------------------------------------------
+ * TODO SSI: Figure out why the ItemPointerIsValid test is needed.
+ * We are sometimes failing with ip_posid == 0 in corner
+ * cases, like the following. Is this some underlying bug?
+ * If not, is this the best way to handle this?
+ *
+ * -- setup
+ * drop table ctl, receipt;
+ * create table ctl (k text not null primary key, deposit_date date not null);
+ * insert into ctl values ('receipt', date '2008-12-22');
+ * create table receipt (receipt_no int not null primary key, deposit_date date not null, amount numeric(13,2));
+ * insert into receipt values (1, (select deposit_date from ctl where k = 'receipt'), 1.00);
+ * insert into receipt values (2, (select deposit_date from ctl where k = 'receipt'), 2.00);
+ *
+ * -- connection 1
+ * start transaction isolation level serializable ;
+ * insert into receipt values (3, (select deposit_date from ctl where k = 'receipt'), 4.00);
+ *
+ * -- connection 2
+ * start transaction isolation level serializable ;
+ * update ctl set deposit_date = date '2008-12-23' where k = 'receipt';
+ *
+ * -- connection 3
+ * start transaction isolation level serializable ;
+ * select * from ctl;
+ *
+ * -- connection 2
+ * rollback;
+ *
+ * -- connection 3
+ * select * from re<Tab><Tab>[nothing shows]ceipt;
+ * > no connection to the server
+ * > The connection to the server was lost. Attempting reset: Succeeded.
+ *----------------------------------------------------------------
+ */
+ /* If there's a new tuple to key on, return to avoid duplicate work. */
+ if (ItemPointerIsValid(&(tuple->t_data->t_ctid))
+ && !ItemPointerEquals(&(tuple->t_self), &(tuple->t_data->t_ctid)))
+ return;
+
+ /*
+ * We may bail out if previous xmax aborted, or if it committed but
+ * only locked the tuple without updating it.
+ */
+ if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED))
+ return;
+
+ /*
+ * If there's a valid xmax, it must be from a concurrent transaction,
+ * since it deleted a tuple which is visible to us.
+ */
+ xid = HeapTupleHeaderGetXmax(tuple->t_data);
+ if (!TransactionIdIsValid(xid))
+ return;
+ }
+ else
+ {
+ /*
+ * We would read this row, but it isn't visible to us.
+ */
+ xid = HeapTupleHeaderGetXmin(tuple->t_data);
+ }
+
+ /*
+ * It's OK to look for conflicts with a share lock, and record them with
+ * an exclusive lock when found; we just have to release the shared lock
+ * before attempting to get the other lock, to prevent deadlocks. We will
+ * need to recheck that the entry still exists after getting the stronger
+ * lock, just in case it rolled back in the window where we weren't
+ * holding a lock.
+ */
+ sxidtag.xid = xid;
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (!sxid)
+ {
+ /* It's not serializable or otherwise not important. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ sxact = sxid->myXact;
+ if (sxact == MySerializableXact || sxact->rolledBack)
+ {
+ /* We can't conflict with our own transaction or one rolled back. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * If this is a read-only transaction and the writing transaction has
+ * committed, and it doesn't have a rw-conflict out or has a conflict out
+ * to a transaction which overlaps this transaction, then no conflict.
+ */
+ if (XactReadOnly
+ && SxactIsCommitted(sxact)
+ && (!TransactionIdIsValid(sxact->outConflict)
+ || (sxact != sxact->outConflict
+ && (!SxactIsCommitted(sxact->outConflict)
+ || XidIsConcurrent(sxact->outConflict->topXid)))))
+ {
+ /* Read-only transaction will appear to run first. No conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /* Remember the sxact's tag so we can re-find it after dropping the lock. */
+ sxacttag = sxact->tag;
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * Make sure we have somewhere to record a conflict against this
+ * transaction.
+ */
+ EnsureMySerializableXidExists();
+
+ /* Re-find the sxact; it may have been cleaned up while we held no lock. */
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxact = (SERIALIZABLEXACT *)
+ hash_search(SerializableXactHash, &sxacttag, HASH_FIND, NULL);
+ if (!sxact)
+ {
+ /* It must have been cleaned up, which means it wasn't useful. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ xid = sxact->topXid;
+ if (!XidIsConcurrent(xid))
+ {
+ /* This write was already in our snapshot; no conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * Flag the conflict. But first, if this conflict creates a dangerous
+ * structure, ereport an error.
+ */
+ FlagRWConflict((SERIALIZABLEXACT *) MySerializableXact, sxact);
+ LWLockRelease(SerializableXactHashLock);
+ }
+
+ /*
+ * Check a particular target for rw-dependency conflict in.
+ */
+ static void
+ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
+ {
+ uint32 targettaghash;
+ LWLockId partitionLock;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCK *predlock;
+
+ Assert(MySerializableXact != InvalidSerializableXact);
+
+ /* The same hash and LW lock apply to the lock target and the lock itself. */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ if (!target)
+ {
+ /* Nothing has this target locked; we're done here. */
+ LWLockRelease(partitionLock);
+ return;
+ }
+
+ /*
+ * Each lock for an overlapping transaction represents a conflict: a
+ * rw-dependency in to this transaction.
+ */
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (predlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ SERIALIZABLEXACT *sxact;
+
+ predlocktargetlink = &(predlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+
+ sxact = predlock->tag.myXact;
+ if (sxact == MySerializableXact)
+ {
+ /*
+ * If we're getting a write lock on the tuple, we don't need a
+ * predicate (SIREAD) lock. At this point our transaction already
+ * has an ExclusiveRowLock on the relation, so we are OK to drop
+ * the predicate lock on the tuple, if found, without fearing that
+ * another write against the tuple will occur before the MVCC
+ * information makes it to the buffer.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCKTARGET *rmtarget = NULL;
+ PREDICATELOCK *rmpredlock;
+ LOCALPREDICATELOCK *locallock,
+ *rmlocallock;
+
+ /*
+ * This is a tuple on which we have a tuple predicate lock. We
+ * only have shared LW locks now; release those, and get
+ * exclusive locks only while we modify things.
+ */
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the predicate lock from shared memory, if it hasn't
+ * been concurrently removed by an index page combine.
+ *
+ * NOTE(review): predlock is dereferenced here after the shared
+ * locks were dropped and reacquired above; presumably the entry
+ * cannot be freed and recycled in that window -- confirm.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode(&(predlock->tag),
+ targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &(predlock->tag),
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock == predlock)
+ {
+ SHMQueueDelete(predlocktargetlink);
+ SHMQueueDelete(&(predlock->xactLink));
+
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &(predlock->tag),
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == predlock);
+
+ /*
+ * When a target is no longer used, remove it.
+ */
+ if (SHMQueueEmpty(&target->predicateLocks))
+ {
+ rmtarget = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag,
+ targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmtarget == target);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+
+ /* Keep the backend-local lock table in sync with shared memory. */
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ Assert(locallock != NULL);
+ Assert(locallock->held);
+ locallock->held = false;
+
+ if (locallock->childLocks == 0)
+ {
+ rmlocallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlocallock == locallock);
+ }
+
+ DecrementParentLocks(targettag);
+
+ /* If the target itself is gone, there's nothing left to scan. */
+ if (rmtarget)
+ return;
+
+ /* Restart the scan from the head under fresh shared locks. */
+ LWLockAcquire(partitionLock, LW_SHARED);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else
+ {
+ /* Someone else removed it; just re-take the shared locks. */
+ LWLockAcquire(partitionLock, LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ }
+ }
+ else if (!(sxact->rolledBack)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && sxact->outConflict != MySerializableXact
+ && MySerializableXact->inConflict != sxact)
+ {
+ /* Upgrade to exclusive only for the duration of the flagging. */
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ FlagRWConflict(sxact, (SERIALIZABLEXACT *) MySerializableXact);
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ }
+
+ /*
+ * CheckForSerializableConflictIn
+ * We are writing the given tuple. If that indicates a rw-conflict
+ * in from another serializable transaction, take appropriate action.
+ *
+ * Skip checking for any granularity for which a parameter is missing.
+ *
+ * A tuple update or delete is in conflict if we have a predicate lock
+ * against the relation or page in which the tuple exists, or against the
+ * tuple itself. A tuple insert is in conflict only if there is a predicate
+ * lock against the entire relation.
+ *
+ * The call to this function also indicates that we need an entry in the
+ * serializable transaction hash table, so that this write's conflicts can
+ * be detected for the proper lifetime, which is until this transaction and
+ * all overlapping serializable transactions have completed.
+ */
+ void
+ CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple,
+ const Buffer buffer)
+ {
+ PREDICATELOCKTARGETTAG targettag;
+
+ /* Common bail-out: wrong isolation level, temp table, etc. */
+ if (SkipSerialization(relation))
+ return;
+
+ /* A write means our conflicts must remain discoverable after we finish. */
+ EnsureMySerializableXidExists();
+
+ /*
+ * It is important that we check for locks from the finest granularity to
+ * the coarsest granularity, so that granularity promotion doesn't cause
+ * us to miss a lock. The new (coarser) lock will be acquired before the
+ * old (finer) locks are released.
+ *
+ * It is not possible to take and hold a lock across the checks for all
+ * granularities because each target could be in a separate partition.
+ */
+ if (tuple != NULL)
+ {
+ /* Tuple granularity: keyed on the tuple's current TID (t_ctid). */
+ SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)),
+ ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)));
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ if (BufferIsValid(buffer))
+ {
+ /* Page granularity. */
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ BufferGetBlockNumber(buffer));
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ /* Relation granularity is always checked. */
+ SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ /*
+ * Flag a rw-dependency between two serializable transactions.
+ * If a conflict field is invalid set it to the other transaction,
+ * if it's already the other transaction leave it alone, otherwise
+ * use self-reference (so we don't need to keep a list).
+ *
+ * The caller is responsible for ensuring that we have a LW lock on
+ * the transaction hash table.
+ */
+ static void
+ FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+ {
+ Assert(reader != writer);
+
+ /* First, see if this conflict causes failure. */
+ OnConflict_CheckForSerializationFailure(reader, writer);
+
+ /*
+ * Actually do the conflict flagging. Each side keeps at most one
+ * conflict pointer; a second distinct conflict collapses to a
+ * self-reference, meaning "multiple conflicts".
+ */
+ if (writer->inConflict == InvalidSerializableXact
+ || writer->inConflict->rolledBack)
+ writer->inConflict = reader;
+ else if (writer->inConflict != reader)
+ writer->inConflict = writer;
+ if (reader->outConflict == InvalidSerializableXact
+ || reader->outConflict->rolledBack)
+ reader->outConflict = writer;
+ else if (reader->outConflict != writer)
+ reader->outConflict = reader;
+ }
+
+ /*
+ * Check whether we should roll back one of these transactions
+ * instead of flagging a conflict.
+ */
+ static void
+ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ const SERIALIZABLEXACT *writer)
+ {
+ bool failure;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ failure = false;
+
+ if (writer->inConflict != reader
+ && writer->outConflict != InvalidSerializableXact
+ && !(writer->outConflict->rolledBack))
+ {
+ /* The writer is or is becoming a pivot. */
+ /* Self-reference prevents checking commit sequence. */
+ if (writer->outConflict == writer
+
+ /*
+ * TODO SSI: Resolve this performance tweak issue.
+ *
+ * Back-and-forth reference is write skew; thus doomed; however,
+ * rolling back here increases chances that a retry will still fail.
+ * It may be better to let it happen at commit time. Only performance
+ * testing can determine whether the next line should be used.
+ *
+ * Leaving it out would be *especially* valuable if the PreCommit
+ * checking could be changed to allow a commit in a situation where it
+ * is leaving another transaction in a state where a commit must fail
+ * -- when the doomed transaction eventually tries to commit, it would
+ * probably be at a time when an immediate retry is very likely to
+ * succeed.
+ */
+ /* || writer->outConflict == reader */
+ )
+ failure = true;
+ else if (SxactIsCommitted(writer->outConflict))
+ {
+ if (SxactCommittedBefore(writer->outConflict, writer)
+ && SxactCommittedBefore(writer->outConflict, reader))
+ /* The out side of the pivot committed first. */
+ failure = true;
+ }
+ else
+ {
+ if (writer->outConflict->inConflict == writer->outConflict)
+ /* Self-reference will prevent checking at commit. */
+ failure = true;
+ }
+ }
+
+ if (reader->outConflict != writer
+ && reader->inConflict != InvalidSerializableXact
+ && !(reader->inConflict->rolledBack))
+ {
+ /* The reader is or is becoming a pivot. */
+ if (SxactIsCommitted(writer))
+ {
+ if (SxactCommittedBefore(writer, reader)
+ && (reader->inConflict == reader
+ || SxactCommittedBefore(writer, reader->inConflict)))
+ /* The out side committed first, as far as we can tell. */
+ failure = true;
+ }
+ else if (writer->inConflict != InvalidSerializableXact
+ && writer->inConflict != reader)
+ /* Self-reference will prevent checking at commit. */
+ failure = true;
+ }
+
+ if (failure)
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ /*
+ * PreCommit_CheckForSerializationFailure
+ * Check for dangerous structures in a serializable transaction
+ * at commit.
+ *
+ * We're checking for a dangerous structure as each conflict is recorded.
+ * The only way we could have a problem at commit is if this is the "out"
+ * side of a pivot, and neither the "in" side nor the pivot itself has yet
+ * committed.
+ */
+ void
+ PreCommit_CheckForSerializationFailure(void)
+ {
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(IsXactIsoLevelFullySerializable);
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Checking at conflict detection should only allow self-reference in if
+ * this transaction is on the out side of a pivot, so
+ * self-reference is OK here.
+ */
+ if (MySerializableXact->inConflict != InvalidSerializableXact
+ && MySerializableXact->inConflict != MySerializableXact
+ && !(MySerializableXact->inConflict->rolledBack)
+ && MySerializableXact->inConflict->inConflict != InvalidSerializableXact
+ && !SxactIsCommitted(MySerializableXact->inConflict)
+ && !SxactIsCommitted(MySerializableXact->inConflict->inConflict))
+ {
+ MySerializableXact->finishedBefore = ShmemVariableCache->nextXid;
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ MySerializableXact->finishedBefore = ShmemVariableCache->nextXid;
+ LWLockRelease(SerializableXactHashLock);
+ }
*** a/src/backend/utils/adt/lockfuncs.c
--- b/src/backend/utils/adt/lockfuncs.c
***************
*** 17,22 ****
--- 17,23 ----
#include "miscadmin.h"
#include "storage/proc.h"
#include "utils/builtins.h"
+ #include "storage/predicate.h"
/* This must match enum LockTagType! */
***************
*** 37,42 **** typedef struct
--- 38,45 ----
{
LockData *lockData; /* state data from lmgr */
int currIdx; /* current PROCLOCK index */
+ PredicateLockData *predLockData; /* state data for pred locks */
+ int predLockIdx; /* current index for pred lock */
} PG_Lock_Status;
***************
*** 69,74 **** pg_lock_status(PG_FUNCTION_ARGS)
--- 72,78 ----
FuncCallContext *funcctx;
PG_Lock_Status *mystatus;
LockData *lockData;
+ PredicateLockData *predLockData;
if (SRF_IS_FIRSTCALL())
{
***************
*** 126,131 **** pg_lock_status(PG_FUNCTION_ARGS)
--- 130,137 ----
mystatus->lockData = GetLockStatusData();
mystatus->currIdx = 0;
+ mystatus->predLockData = GetPredicateLockStatusData();
+ mystatus->predLockIdx = 0;
MemoryContextSwitchTo(oldcontext);
}
***************
*** 303,308 **** pg_lock_status(PG_FUNCTION_ARGS)
--- 309,379 ----
SRF_RETURN_NEXT(funcctx, result);
}
+ /*
+ * Have returned all regular locks. Now start on the SIREAD
+ * predicate locks.
+ */
+ predLockData = mystatus->predLockData;
+ if (mystatus->predLockIdx < predLockData->nelements)
+ {
+ PREDICATELOCKTARGETTAG *predTag = &(predLockData->locktags[mystatus->predLockIdx]);
+ SERIALIZABLEXACT *xact = &(predLockData->xacts[mystatus->predLockIdx]);
+ Datum values[14];
+ bool nulls[14];
+ HeapTuple tuple;
+ Datum result;
+
+ mystatus->predLockIdx++;
+
+ /*
+ * Form tuple with appropriate data.
+ */
+ MemSet(values, 0, sizeof(values));
+ MemSet(nulls, false, sizeof(nulls));
+
+ /* lock type */
+ values[0] = CStringGetTextDatum("tuple");
+
+ /* lock target */
+ values[1] = GET_PREDICATELOCKTARGETTAG_DB(*predTag);
+ values[2] = GET_PREDICATELOCKTARGETTAG_RELATION(*predTag);
+ values[4] = GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag);
+ if (values[4] == InvalidOffsetNumber)
+ {
+ values[0] = CStringGetTextDatum("page");
+ nulls[4] = true;
+ }
+ values[3] = GET_PREDICATELOCKTARGETTAG_PAGE(*predTag);
+ if (values[3] == InvalidBlockNumber)
+ {
+ values[0] = CStringGetTextDatum("relation");
+ nulls[3] = true;
+ }
+
+ /* these fields are targets for other types of locks */
+ nulls[5] = true; /* virtualxid */
+ nulls[6] = true; /* transactionid */
+ nulls[7] = true; /* classid */
+ nulls[8] = true; /* objid */
+ nulls[9] = true; /* objsubid */
+
+ /* lock holder */
+ values[10] = VXIDGetDatum(xact->tag.vxid.backendId,
+ xact->tag.vxid.localTransactionId);
+ nulls[11] = true; /* pid */
+
+ /*
+ * Lock mode. Currently all predicate locks are SIReadLocks,
+ * which are always held (never waiting)
+ */
+ values[12] = CStringGetTextDatum("SIReadLock");
+ values[13] = BoolGetDatum(true);
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+
SRF_RETURN_DONE(funcctx);
}
*** a/src/backend/utils/adt/ri_triggers.c
--- b/src/backend/utils/adt/ri_triggers.c
***************
*** 3308,3314 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan,
/*
* In READ COMMITTED mode, we just need to use an up-to-date regular
* snapshot, and we will see all rows that could be interesting. But in
! * SERIALIZABLE mode, we can't change the transaction snapshot. If the
* caller passes detectNewRows == false then it's okay to do the query
* with the transaction snapshot; otherwise we use a current snapshot, and
* tell the executor to error out if it finds any rows under the current
--- 3308,3314 ----
/*
* In READ COMMITTED mode, we just need to use an up-to-date regular
* snapshot, and we will see all rows that could be interesting. But in
! * xact-snapshot-based modes, we can't change the transaction snapshot. If the
* caller passes detectNewRows == false then it's okay to do the query
* with the transaction snapshot; otherwise we use a current snapshot, and
* tell the executor to error out if it finds any rows under the current
***************
*** 3316,3322 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan,
* that SPI_execute_snapshot will register the snapshots, so we don't need
* to bother here.
*/
! if (IsXactIsoLevelSerializable && detectNewRows)
{
CommandCounterIncrement(); /* be sure all my own work is visible */
test_snapshot = GetLatestSnapshot();
--- 3316,3322 ----
* that SPI_execute_snapshot will register the snapshots, so we don't need
* to bother here.
*/
! if (IsXactIsoLevelXactSnapshotBased && detectNewRows)
{
CommandCounterIncrement(); /* be sure all my own work is visible */
test_snapshot = GetLatestSnapshot();
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 58,63 ****
--- 58,64 ----
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
+ #include "storage/predicate.h"
#include "tcop/tcopprot.h"
#include "tsearch/ts_cache.h"
#include "utils/builtins.h"
***************
*** 1635,1640 **** static struct config_int ConfigureNamesInt[] =
--- 1636,1652 ----
},
{
+ {"max_predicate_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT,
+ gettext_noop("Sets the maximum number of predicate locks per transaction."),
+ gettext_noop("The shared predicate lock table is sized on the assumption that "
+ "at most max_predicate_locks_per_transaction * max_connections distinct "
+ "objects will need to be locked at any one time.")
+ },
+ &max_predicate_locks_per_xact,
+ 64, 10, INT_MAX, NULL, NULL
+ },
+
+ {
{"authentication_timeout", PGC_SIGHUP, CONN_AUTH_SECURITY,
gettext_noop("Sets the maximum allowed time to complete client authentication."),
NULL,
*** a/src/backend/utils/resowner/resowner.c
--- b/src/backend/utils/resowner/resowner.c
***************
*** 261,267 **** ResourceOwnerReleaseInternal(ResourceOwner owner,
--- 261,270 ----
* the top of the recursion.
*/
if (owner == TopTransactionResourceOwner)
+ {
ProcReleaseLocks(isCommit);
+ ReleasePredicateLocks(isCommit);
+ }
}
else
{
*** a/src/backend/utils/time/snapmgr.c
--- b/src/backend/utils/time/snapmgr.c
***************
*** 37,44 ****
/*
! * CurrentSnapshot points to the only snapshot taken in a serializable
! * transaction, and to the latest one taken in a read-committed transaction.
* SecondarySnapshot is a snapshot that's always up-to-date as of the current
* instant, even on a serializable transaction. It should only be used for
* special-purpose code (say, RI checking.)
--- 37,44 ----
/*
! * CurrentSnapshot points to the only snapshot taken in an xact-snapshot-based
! * transaction; otherwise to the latest one taken.
* SecondarySnapshot is a snapshot that's always up-to-date as of the current
* instant, even on a serializable transaction. It should only be used for
* special-purpose code (say, RI checking.)
***************
*** 97,107 **** static int RegisteredSnapshots = 0;
bool FirstSnapshotSet = false;
/*
! * Remembers whether this transaction registered a serializable snapshot at
* start. We cannot trust FirstSnapshotSet in combination with
! * IsXactIsoLevelSerializable, because GUC may be reset before us.
*/
! static bool registered_serializable = false;
static Snapshot CopySnapshot(Snapshot snapshot);
--- 97,107 ----
bool FirstSnapshotSet = false;
/*
! * Remembers whether this transaction registered a transaction-based snapshot at
* start. We cannot trust FirstSnapshotSet in combination with
! * IsXactIsoLevelXactSnapshotBased, because GUC may be reset before us.
*/
! static bool registered_xact_snapshot = false;
static Snapshot CopySnapshot(Snapshot snapshot);
***************
*** 130,150 **** GetTransactionSnapshot(void)
FirstSnapshotSet = true;
/*
! * In serializable mode, the first snapshot must live until end of
! * xact regardless of what the caller does with it, so we must
! * register it internally here and unregister it at end of xact.
*/
! if (IsXactIsoLevelSerializable)
{
CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_serializable = true;
}
return CurrentSnapshot;
}
! if (IsXactIsoLevelSerializable)
return CurrentSnapshot;
CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
--- 130,153 ----
FirstSnapshotSet = true;
/*
! * In xact-snapshot-based isolation levels, the first snapshot must
! * live until end of xact regardless of what the caller does with it,
! * so we must register it internally here and unregister it at end of
! * xact.
*/
! if (IsXactIsoLevelXactSnapshotBased)
{
CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_xact_snapshot = true;
! if (IsXactIsoLevelFullySerializable)
! RegisterSerializableTransaction(CurrentSnapshot);
}
return CurrentSnapshot;
}
! if (IsXactIsoLevelXactSnapshotBased)
return CurrentSnapshot;
CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
***************
*** 155,161 **** GetTransactionSnapshot(void)
/*
* GetLatestSnapshot
* Get a snapshot that is up-to-date as of the current instant,
! * even if we are executing in SERIALIZABLE mode.
*/
Snapshot
GetLatestSnapshot(void)
--- 158,164 ----
/*
* GetLatestSnapshot
* Get a snapshot that is up-to-date as of the current instant,
! * even if we are executing in xact-snapshot-based mode.
*/
Snapshot
GetLatestSnapshot(void)
***************
*** 515,527 **** void
AtEarlyCommit_Snapshot(void)
{
/*
! * On a serializable transaction we must unregister our private refcount
! * to the serializable snapshot.
*/
! if (registered_serializable)
UnregisterSnapshotFromOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_serializable = false;
}
--- 518,530 ----
AtEarlyCommit_Snapshot(void)
{
/*
! * On an xact-snapshot-based transaction we must unregister our private
! * refcount to the xact snapshot.
*/
! if (registered_xact_snapshot)
UnregisterSnapshotFromOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_xact_snapshot = false;
}
***************
*** 557,561 **** AtEOXact_Snapshot(bool isCommit)
SecondarySnapshot = NULL;
FirstSnapshotSet = false;
! registered_serializable = false;
}
--- 560,564 ----
SecondarySnapshot = NULL;
FirstSnapshotSet = false;
! registered_xact_snapshot = false;
}
*** a/src/include/access/heapam.h
--- b/src/include/access/heapam.h
***************
*** 82,89 **** extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
extern bool heap_fetch(Relation relation, Snapshot snapshot,
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
Relation stats_relation);
! extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer,
! Snapshot snapshot, bool *all_dead);
extern bool heap_hot_search(ItemPointer tid, Relation relation,
Snapshot snapshot, bool *all_dead);
--- 82,89 ----
extern bool heap_fetch(Relation relation, Snapshot snapshot,
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
Relation stats_relation);
! extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
! Buffer buffer, Snapshot snapshot, bool *all_dead);
extern bool heap_hot_search(ItemPointer tid, Relation relation,
Snapshot snapshot, bool *all_dead);
*** a/src/include/access/xact.h
--- b/src/include/access/xact.h
***************
*** 32,41 **** extern int DefaultXactIsoLevel;
extern int XactIsoLevel;
/*
! * We only implement two isolation levels internally. This macro should
! * be used to check which one is selected.
*/
! #define IsXactIsoLevelSerializable (XactIsoLevel >= XACT_REPEATABLE_READ)
/* Xact read-only state */
extern bool DefaultXactReadOnly;
--- 32,45 ----
extern int XactIsoLevel;
/*
! * We implement three isolation levels internally.
! * The two stronger ones use one snapshot per database transaction;
! * the others use one snapshot per statement.
! * Serializable uses predicate locks.
! * These macros should be used to check which isolation level is selected.
*/
! #define IsXactIsoLevelXactSnapshotBased (XactIsoLevel >= XACT_REPEATABLE_READ)
! #define IsXactIsoLevelFullySerializable (XactIsoLevel == XACT_SERIALIZABLE)
/* Xact read-only state */
extern bool DefaultXactReadOnly;
*** a/src/include/catalog/pg_am.h
--- b/src/include/catalog/pg_am.h
***************
*** 49,54 **** CATALOG(pg_am,2601)
--- 49,55 ----
bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */
bool amstorage; /* can storage type differ from column type? */
bool amclusterable; /* does AM support cluster command? */
+ bool ampredlocks; /* does AM handle predicate locks? */
Oid amkeytype; /* type of data in index, or InvalidOid */
regproc aminsert; /* "insert this tuple" function */
regproc ambeginscan; /* "start new scan" function */
***************
*** 76,82 **** typedef FormData_pg_am *Form_pg_am;
* compiler constants for pg_am
* ----------------
*/
! #define Natts_pg_am 26
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
--- 77,83 ----
* compiler constants for pg_am
* ----------------
*/
! #define Natts_pg_am 27
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
***************
*** 89,124 **** typedef FormData_pg_am *Form_pg_am;
#define Anum_pg_am_amsearchnulls 10
#define Anum_pg_am_amstorage 11
#define Anum_pg_am_amclusterable 12
! #define Anum_pg_am_amkeytype 13
! #define Anum_pg_am_aminsert 14
! #define Anum_pg_am_ambeginscan 15
! #define Anum_pg_am_amgettuple 16
! #define Anum_pg_am_amgetbitmap 17
! #define Anum_pg_am_amrescan 18
! #define Anum_pg_am_amendscan 19
! #define Anum_pg_am_ammarkpos 20
! #define Anum_pg_am_amrestrpos 21
! #define Anum_pg_am_ambuild 22
! #define Anum_pg_am_ambulkdelete 23
! #define Anum_pg_am_amvacuumcleanup 24
! #define Anum_pg_am_amcostestimate 25
! #define Anum_pg_am_amoptions 26
/* ----------------
* initial contents of pg_am
* ----------------
*/
! DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DESCR("hash index access method");
#define HASH_AM_OID 405
! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
--- 90,126 ----
#define Anum_pg_am_amsearchnulls 10
#define Anum_pg_am_amstorage 11
#define Anum_pg_am_amclusterable 12
! #define Anum_pg_am_ampredlocks 13
! #define Anum_pg_am_amkeytype 14
! #define Anum_pg_am_aminsert 15
! #define Anum_pg_am_ambeginscan 16
! #define Anum_pg_am_amgettuple 17
! #define Anum_pg_am_amgetbitmap 18
! #define Anum_pg_am_amrescan 19
! #define Anum_pg_am_amendscan 20
! #define Anum_pg_am_ammarkpos 21
! #define Anum_pg_am_amrestrpos 22
! #define Anum_pg_am_ambuild 23
! #define Anum_pg_am_ambulkdelete 24
! #define Anum_pg_am_amvacuumcleanup 25
! #define Anum_pg_am_amcostestimate 26
! #define Anum_pg_am_amoptions 27
/* ----------------
* initial contents of pg_am
* ----------------
*/
! DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DESCR("hash index access method");
#define HASH_AM_OID 405
! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
*** a/src/include/storage/lwlock.h
--- b/src/include/storage/lwlock.h
***************
*** 27,32 ****
--- 27,36 ----
#define LOG2_NUM_LOCK_PARTITIONS 4
#define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS)
+ /* Number of partitions the shared predicate lock tables are divided into */
+ #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4
+ #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
/*
* We have a number of predefined LWLocks, plus a bunch of LWLocks that are
* dynamically assigned (e.g., for shared buffers). The LWLock structures
***************
*** 70,81 **** typedef enum LWLockId
RelationMappingLock,
AsyncCtlLock,
AsyncQueueLock,
/* Individual lock IDs end here */
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
/* must be last except for MaxDynamicLWLock: */
! NumFixedLWLocks = FirstLockMgrLock + NUM_LOCK_PARTITIONS,
MaxDynamicLWLock = 1000000000
} LWLockId;
--- 74,89 ----
RelationMappingLock,
AsyncCtlLock,
AsyncQueueLock,
+ SerializableXactHashLock,
+ SerializableFinishedListLock,
+ SerializablePredicateLockListLock,
/* Individual lock IDs end here */
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
+ FirstPredicateLockMgrLock = FirstLockMgrLock + NUM_LOCK_PARTITIONS,
/* must be last except for MaxDynamicLWLock: */
! NumFixedLWLocks = FirstPredicateLockMgrLock + NUM_PREDICATELOCK_PARTITIONS,
MaxDynamicLWLock = 1000000000
} LWLockId;
*** /dev/null
--- b/src/include/storage/predicate.h
***************
*** 0 ****
--- 1,174 ----
+ /*-------------------------------------------------------------------------
+ *
+ * predicate.h
+ * POSTGRES predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+ #ifndef PREDICATE_H
+ #define PREDICATE_H
+
+ #include "access/htup.h"
+ #include "utils/snapshot.h"
+
+ /* GUC variables */
+ extern int max_predicate_locks_per_xact;
+
+ /*
+ * The SERIALIZABLEXACTTAG struct identifies a serializable transaction.
+ */
+ typedef struct SERIALIZABLEXACTTAG
+ {
+ VirtualTransactionId vxid; /* We always have one of these. */
+ } SERIALIZABLEXACTTAG;
+
+ /*
+ * Information needed for each serializable database transaction to support SSI techniques.
+ * TODO SSI: Should inConflict and outConflict be lists? That would allow us to reduce
+ * false positives, *and* would allow us to guarantee that an immediate retry
+ * of a transaction would never fail on the exact same conflicts.
+ * The RAM doesn't look like it would be the limiting factor, but CPU time might
+ * be -- we should have baseline benchmarks before attempting this.
+ */
+ typedef struct SERIALIZABLEXACT
+ {
+ /* hash key */
+ SERIALIZABLEXACTTAG tag;
+
+ /* data */
+ struct SERIALIZABLEXACT *outConflict; /* ptr to write transaction
+ * whose data we couldn't
+ * read. invalid means no
+ * conflict; self-reference
+ * means multiple or
+ * committed. */
+ struct SERIALIZABLEXACT *inConflict; /* ptr to read transaction
+ * which couldn't see our
+ * write. invalid means no
+ * conflict; self-reference
+ * means multiple or
+ * committed. */
+ TransactionId topXid; /* top level xid for the transaction, if one
+ * exists */
+ TransactionId finishedBefore; /* invalid means still running; else
+ * the struct expires when no tags <
+ * this. */
+ TransactionId xmin; /* the transaction's snapshot xmin */
+ SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
+ SHM_QUEUE xids; /* list of associated SERIALIZABLEXID objects */
+ SHM_QUEUE finishedLink; /* list link in
+ * FinishedSerializableTransactions */
+ bool rolledBack; /* ignore conflicts when true; allows deferred
+ * cleanup */
+ } SERIALIZABLEXACT;
+
+
+ typedef enum PredicateLockTargetType
+ {
+ PREDLOCKTAG_RELATION,
+ PREDLOCKTAG_PAGE,
+ PREDLOCKTAG_TUPLE
+ /* TODO Other types may be needed for index locking */
+ } PredicateLockTargetType;
+
+ /*
+ * The PREDICATELOCKTARGETTAG struct is defined to fit into 16
+ * bytes with no padding. Note that this would need adjustment if we were
+ * to widen Oid or BlockNumber to more than 32 bits.
+ */
+ typedef struct PREDICATELOCKTARGETTAG
+ {
+ uint32 locktag_field1; /* a 32-bit ID field */
+ uint32 locktag_field2; /* a 32-bit ID field */
+ uint32 locktag_field3; /* a 32-bit ID field */
+ uint16 locktag_field4; /* a 16-bit ID field */
+ uint16 locktag_field5; /* a 16-bit ID field */
+ } PREDICATELOCKTARGETTAG;
+
+ /*
+ * These macros define how we map logical IDs of lockable objects into
+ * the physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
+ * rather than accessing the fields directly. Note multiple eval of target!
+ *
+ * TODO SSI: If we always use the same fields for the same type of value,
+ * we should rename these. Holding off until it's clear there are no exceptions.
+ * Since indexes are relations with blocks and tuples, it's looking likely that
+ * the rename will be possible. If not, we may need to divide the last field
+ * and use part of it for a target type, so that we know how to interpret the
+ * data.
+ */
+ #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = InvalidBlockNumber, \
+ (locktag).locktag_field4 = InvalidOffsetNumber, \
+ (locktag).locktag_field5 = 0)
+
+ #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = InvalidOffsetNumber, \
+ (locktag).locktag_field5 = 0)
+
+ #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = (offnum), \
+ (locktag).locktag_field5 = 0)
+
+ #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
+ ((locktag).locktag_field1)
+ #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
+ ((locktag).locktag_field2)
+ #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
+ ((locktag).locktag_field3)
+ #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
+ ((locktag).locktag_field4)
+ #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
+ (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
+ (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
+ PREDLOCKTAG_RELATION))
+
+ typedef struct PredicateLockData
+ {
+ int nelements;
+ PREDICATELOCKTARGETTAG *locktags;
+ SERIALIZABLEXACT *xacts;
+ } PredicateLockData;
+
+ /*
+ * function prototypes
+ */
+
+ /* housekeeping for shared memory predicate lock structures */
+ extern void InitPredicateLocks(void);
+ extern Size PredicateLockShmemSize(void);
+
+ /* predicate lock reporting */
+ extern PredicateLockData *GetPredicateLockStatusData(void);
+
+ /* predicate lock maintenance */
+ extern void RegisterSerializableTransaction(const Snapshot snapshot);
+ extern void PredicateLockRelation(const Relation relation);
+ extern void PredicateLockPage(const Relation relation, const BlockNumber blkno);
+ extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple);
+ extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
+ extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
+ extern void ReleasePredicateLocks(const bool isCommit);
+
+ /* conflict detection (may also trigger rollback) */
+ extern void CheckForSerializableConflictOut(const bool valid, const Relation relation, const HeapTuple tuple, const Buffer buffer);
+ extern void CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple, const Buffer buffer);
+
+ /* final rollback checking */
+ extern void PreCommit_CheckForSerializationFailure(void);
+
+ #endif /* PREDICATE_H */
*** a/src/include/storage/shmem.h
--- b/src/include/storage/shmem.h
***************
*** 70,74 **** extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem);
--- 70,75 ----
extern Pointer SHMQueueNext(SHM_QUEUE *queue, SHM_QUEUE *curElem,
Size linkOffset);
extern bool SHMQueueEmpty(SHM_QUEUE *queue);
+ extern bool SHMQueueIsDetached(SHM_QUEUE *queue);
#endif /* SHMEM_H */
*** a/src/test/regress/GNUmakefile
--- b/src/test/regress/GNUmakefile
***************
*** 135,140 **** tablespace-setup:
--- 135,157 ----
##
+ ## Prepare for dtester tests
+ ##
+ pg_dtester.py: pg_dtester.py.in GNUmakefile $(top_builddir)/src/Makefile.global
+ sed -e 's,@bindir@,$(bindir),g' \
+ -e 's,@libdir@,$(libdir),g' \
+ -e 's,@pkglibdir@,$(pkglibdir),g' \
+ -e 's,@datadir@,$(datadir),g' \
+ -e 's/@VERSION@/$(VERSION)/g' \
+ -e 's/@host_tuple@/$(host_tuple)/g' \
+ -e 's,@GMAKE@,$(MAKE),g' \
+ -e 's/@enable_shared@/$(enable_shared)/g' \
+ -e 's/@GCC@/$(GCC)/g' \
+ $< >$@
+ chmod a+x $@
+
+
+ ##
## Run tests
##
***************
*** 152,157 **** installcheck-parallel: all
--- 169,179 ----
standbycheck: all
$(pg_regress_call) --psqldir=$(PSQLDIR) --schedule=$(srcdir)/standby_schedule --use-existing
+ dcheck: pg_dtester.py
+ ./pg_dtester.py --temp-install --top-builddir=$(top_builddir) \
+ --multibyte=$(MULTIBYTE) $(MAXCONNOPT) $(NOLOCALE)
+
+
# old interfaces follow...
runcheck: check
*** /dev/null
--- b/src/test/regress/pg_dtester.py.in
***************
*** 0 ****
--- 1,1626 ----
+ #!/usr/bin/python
+
+ #-------------------------------------------------------------------------
+ #
+ # pg_dtester.py.in
+ #
+ # Sample test suite running two concurrent transactions, showing
+ # off some capabilities of dtester.
+ #
+ # Copyright (c) 2006-2010, Markus Wanner
+ #
+ #-------------------------------------------------------------------------
+
+ import re, os, sys, getopt
+ from twisted.internet import defer, reactor
+ from twisted.python import failure
+
+ from dtester.events import EventMatcher, EventSource, Event, \
+ ProcessOutputEvent, ProcessErrorEvent, ProcessEndedEvent
+ from dtester.exceptions import TestAborted, TestFailure
+ from dtester.test import TestSuite, BaseTest, SyncTest
+ from dtester.reporter import StreamReporter, CursesReporter
+ from dtester.runner import Runner, Timeout
+
+ # ****** definition of tests and suites ***********************************
+
+ class InstallationSuite(TestSuite):
+
+ setUpDescription = "creating temporary installation"
+ tearDownDescription = "removing temporary installation"
+
+ needs = (('shell', "IShell or something"),)
+
+ def setUp(self):
+ # inherit getConfig from the shell
+ setattr(self, 'getConfig', self.shell.getConfig)
+ setattr(self, 'runCommand', self.shell.runCommand)
+ setattr(self, 'recursive_remove', self.shell.recursive_remove)
+
+ # (re) create an installation directory
+ self.pg_inst_dir = self.shell.getConfig('inst_dir')
+ if os.path.exists(self.pg_inst_dir):
+ self.shell.recursive_remove(self.pg_inst_dir)
+ os.mkdir(self.pg_inst_dir)
+
+ # install into that directory
+ proc = self.shell.runCommand('make', 'make',
+ args=['make', '-C', self.shell.getConfig('top-builddir'),
+ 'DESTDIR=%s' % self.pg_inst_dir, 'install',
+ 'with_perl=no', 'with_python=no'],
+ lineBasedOutput=True)
+
+ d = self.waitFor(proc, EventMatcher(ProcessEndedEvent))
+ d.addCallback(self.makeTerminated)
+ proc.start()
+
+ # FIXME: how to properly handle these?
+ self.shell.addEnvPath(self.shell.getConfig('bindir'))
+ self.shell.addEnvLibraryPath(self.shell.getConfig('libdir'))
+ return d
+
+ def makeTerminated(self, event):
+ if event.exitCode != 0:
+ raise Exception("make install returned %d" % event.exitCode)
+ else:
+ return True
+
+ def tearDown(self):
+ # The installation procedure should be able to simply override any
+ # formerly installed files, so we save the time to clean up the
+ # installation directory.
+ return
+
+
+ class InitdbSuite(TestSuite):
+
+ args = (('number', int), )
+ needs = (('shell', "IShell or something"),)
+
+ def setUpDescription(self):
+ return "initializing database system %d" % self.number
+
+ def tearDownDescription(self):
+ return "removing database system %d" % self.number
+
+ def getNumber(self):
+ return self.number
+
+ def getDir(self):
+ return self.dbdir
+
+ def setUp(self):
+ self.dbdir = "%s%d" % \
+ (self.shell.getConfig('pgdata_prefix'), self.number)
+ proc = self.shell.runCommand(
+ 'initdb-%d' % self.number,
+ 'initdb', args = [
+ 'initdb', '-D', self.dbdir,
+ '-A', 'trust', '--noclean'],
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ proc.addHook(EventMatcher(ProcessEndedEvent),
+ self.initdb_terminated, d)
+ proc.start()
+ return d
+
+ def initdb_terminated(self, event, d):
+ if event.exitCode != 0:
+ d.errback(Exception("Initdb returned %d" % event.exitCode))
+ else:
+ d.callback(True)
+
+ def tearDown(self):
+ self.shell.recursive_remove(
+ "%s%d" % (self.shell.getConfig('pgdata_prefix'), self.number))
+
+
+ class PostmasterSuite(TestSuite):
+
+ needs = (('shell', "IShell or something"),
+ ('dbdir', "IDatabaseDir"),)
+
+ def setUpDescription(self):
+ return "starting database system %d" % self.dbdir.getNumber()
+
+ def tearDownDescription(self):
+ return "stopping database system %d" % self.dbdir.getNumber()
+
+ def getPort(self):
+ return self.port
+
+ def setUp(self):
+ setattr(self, 'getNumber', self.dbdir.getNumber)
+
+ self.port = self.shell.getConfig('temp-port') + self.dbdir.getNumber()
+
+ args = ['postmaster', '-d5',
+ '-D', self.dbdir.getDir(),
+ '-i', '-p', str(self.port)]
+ if self.shell.getConfig('enable_cassert'):
+ args += ["-A1"]
+
+ self.postmaster = self.shell.runCommand(
+ 'postmaster%d' % self.dbdir.getNumber(),
+ 'postmaster',
+ args = args,
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ self.readyHook = \
+ self.postmaster.addHook(EventMatcher(ProcessErrorEvent,
+ "database system is ready to accept connections"),
+ self.postmaster_ready, d)
+
+ self.unexpectedTerminationHook = \
+ self.postmaster.addHook(EventMatcher(ProcessEndedEvent),
+ self.postmaster_terminated)
+ self.postmaster.start()
+ return d
+
+ def postmaster_ready(self, event, d):
+ # it's sufficient if we're called once
+ self.postmaster.removeHook(self.readyHook)
+ d.callback(None)
+
+ def postmaster_terminated(self, event):
+ exitCode = 'undef'
+ if hasattr(event, 'exitCode'):
+ exitCode = event.exitCode
+ elif hasattr(event, 'data'):
+ exitCode = repr(event.data)
+ self.abort("postmaster %d unexpectedly terminated (exit code %s)" % \
+ (self.dbdir.getNumber(), exitCode))
+
+ def tearDown(self):
+ self.postmaster.removeHook(self.unexpectedTerminationHook)
+ if not self.aborted:
+ d = defer.Deferred()
+ self.postmaster.addHook(EventMatcher(ProcessEndedEvent),
+ lambda event: d.callback(None))
+ self.postmaster.stop()
+ return d
+ else:
+ return True
+
+
+ class TestDatabaseSuite(TestSuite):
+
+ args = (('dbname', str),)
+ needs = (('shell', "IShell or something"),
+ ('pg', "IPostmaster"),)
+
+ def setUpDescription(self):
+ return "creating database %s at server %d" % \
+ (self.dbname, self.pg.getNumber())
+
+ def tearDownDescription(self):
+ return "not (!) dropping database %s at server %d" % \
+ (self.dbname, self.pg.getNumber())
+
+ def getDbname(self):
+ return self.dbname
+
+ def setUp(self):
+ setattr(self, "getPort", self.pg.getPort)
+ setattr(self, "getNumber", self.pg.getNumber)
+
+ self.proc = self.shell.runCommand(
+ 'createdb%d' % self.pg.getNumber(),
+ 'createdb',
+ args = ['createdb',
+ '-p', str(self.getPort()), self.dbname],
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ self.proc.addHook(EventMatcher(ProcessEndedEvent),
+ self.createdb_terminated, d)
+ self.proc.start()
+ return d
+
+ def createdb_terminated(self, event, d):
+ if event.exitCode != 0:
+ d.errback(Exception("createdb terminated with code %d" % \
+ event.exitCode))
+ else:
+ d.callback(None)
+
+ def tearDown(self):
+ if self.pg.aborted:
+ return True
+
+ # Hm.. this interferes with the postmaster suites, which need
+ # to be started and stopped several times on top of a test database,
+ # however, creating and dropping it certainly depends on a running
+ # postmaster. Not sure how to solve this, at the moment I'm just
+ # skipping cleanup, i.e. dropdb.
+ return True
+
+ self.proc = self.shell.runCommand(
+ 'dropdb%d' % self.pg.getNumber(),
+ 'dropdb',
+ args = ['dropdb',
+ '-p', str(self.getPort()), self.dbname],
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ self.proc.addHook(EventMatcher(ProcessEndedEvent),
+ self.dropdb_terminated, d)
+ self.proc.start()
+ return d
+
+ def dropdb_terminated(self, event, d):
+ if event.exitCode != 0:
+ d.errback(Exception("dropdb returned with %d" % \
+ event.exitCode))
+ else:
+ d.callback(None)
+
+
+ class SqlConnectionSuite(TestSuite):
+
+ args = (('dbname', str),)
+ needs = (('shell', "IShell or something"),
+ ('db', "IPostmaster"))
+
+ def setUpDescription(self):
+ return "connecting to database %s at server %d" % \
+ (self.dbname, self.db.getNumber())
+ def tearDownDescription(self):
+ return "disconnecting from database %s at server %d" % \
+ (self.dbname, self.db.getNumber())
+
+ def getDbname(self):
+ return self.dbname
+
+ def setUp(self):
+ self.psql = self.shell.runCommand(
+ 'psql%d' % self.db.getNumber(),
+ 'psql',
+ args=['psql', '-AEn',
+ '--pset=pager=off', '--pset=columns=0',
+ '-p', str(self.db.getPort()),
+ self.dbname])
+
+ # initialize the output buffer and attach a first output collector
+ # *before* the process is started.
+ self.output_buffer = ""
+ d = defer.Deferred()
+ self.outputCollectorDeferred = d
+ self.outputCollectorHook = self.psql.addHook(
+ EventMatcher(ProcessOutputEvent), self.outputCollector,
+ None, d)
+
+ # Mark as being in used, until we get to the commandline
+ self.inUse = True
+ self.workQueue = []
+
+ # also add a termination hook
+ self.unexpectedTerminationHook = self.psql.addHook(
+ EventMatcher(ProcessEndedEvent), self.psql_terminated)
+
+ # then schedule start of the psql process and return the deferred
+ # *before* starting the process.
+ reactor.callLater(0.0, self.psql.start)
+ return d
+
+ def psql_terminated(self, event):
+ exitCode = "undef"
+ if hasattr(event, 'exitCode'):
+ exitCode = event.exitCode
+ elif hasattr(event, 'data'):
+ exitCode = repr(event.data)
+
+ # If there's an outputCollectorHook, the abort method won't catch
+ # and we have to wait for the timeout to trigger, instead of
+ # acting on process termination. We thus save the outputCollector
+ # deferred and send it an errback with the failure.
+ if self.outputCollectorHook:
+ self.outputCollectorDeferred.errback( \
+ TestAborted("psql to server %d unexpectedly terminated (exit code %s)" % ( \
+ self.db.getNumber(), exitCode)))
+ self.abort(
+ "psql to server %d unexpectedly terminated (exit code %s)" % ( \
+ self.db.getNumber(), exitCode))
+
+ def tearDown(self):
+ self.psql.removeHook(self.unexpectedTerminationHook)
+
+ d = defer.Deferred()
+ self.psql.addHook(EventMatcher(ProcessEndedEvent),
+ lambda event: d.callback(None))
+ reactor.callLater(0.0, self.psql.write, "\\q\n")
+ reactor.callLater(5.0, self.psql.stop)
+ return d
+
+ def outputCollector(self, event, query, d):
+ self.output_buffer += event.data
+
+ cmdprompt = self.dbname + '=#'
+ cpos = self.output_buffer.find(cmdprompt)
+
+ if cpos >= 0:
+ self.psql.removeHook(self.outputCollectorHook)
+ self.outputCollectorHook = False
+ result = self.output_buffer[:cpos]
+ self.output_buffer = self.output_buffer[cpos + len(cmdprompt):]
+ if len(self.output_buffer) > 0 and self.output_buffer != ' ':
+ print "rest: %s" % repr(self.output_buffer)
+ if d:
+ # remove the command prompt at the end
+ result = result[:cpos]
+
+ if query:
+ # remove the query string at the beginning
+ query_len = len(query)
+ if result[:query_len] != query:
+ raise Exception("Query not found at beginning of psql answer.")
+
+ result = result[query_len:]
+ while (len(result) > 1) and (result[0] in ("\n", "\r", " ")):
+ result = result[1:]
+ reactor.callLater(0.0, d.callback, result)
+
+ self.inUse = False
+ if len(self.workQueue) > 0:
+ assert not self.inUse
+ job = self.workQueue.pop(0)
+ d1 = job['method'](*job['args'])
+ d1.chainDeferred(job['deferred'])
+
+ def query(self, query):
+ if self.inUse:
+ d = defer.Deferred()
+ self.workQueue.append({'deferred': d,
+ 'method': self.query,
+ 'args': (query,)})
+ return d
+
+ assert not self.inUse
+ assert not self.outputCollectorHook
+
+ self.inUse = True
+ self.output_buffer = ""
+ d = defer.Deferred()
+ self.outputCollectorHook = self.psql.addHook(
+ EventMatcher(ProcessOutputEvent), self.outputCollector, query, d)
+ d.addCallback(self.parseQueryResult)
+
+ # defer writing to the process, so that the caller has the
+ # opportunity to add callbacks to the deferred we return.
+ reactor.callLater(0.0, self.psql.write, query + "\n")
+
+ return d
+
+ def parseQueryResult(self, result):
+ rawlines = result.split('\n')
+
+ lines = []
+ for line in rawlines:
+ line = line.strip()
+ if line.startswith("ROLLBACK"):
+ raise Exception("transaction rolled back (%s)" % line)
+ if line.startswith("message type"):
+ raise Exception("protocol error: %s" % line)
+ if len(line) > 0 and not line.startswith("NOTICE:") \
+ and not line.startswith("ROLLBACK"):
+ lines.append(line)
+
+ try:
+ assert len(lines) >= 2
+
+ lines = map(lambda x: x.strip(), lines)
+ headLine = lines[0]
+ tailLine = lines[-1]
+
+ fields = headLine.split('|')
+ rows = []
+ for row in lines[1:-1]:
+ attrs = row.split('|')
+ assert len(attrs) == len(fields)
+ x = {}
+ for i in range(len(attrs)):
+ x[fields[i]] = attrs[i].strip()
+ rows.append(x)
+
+ x = re.compile("\((\d+) rows?\)").search(tailLine)
+ if x:
+ if not int(x.group(1)) == len(rows):
+ raise Exception("number of rows doesn't match: %s vs %d for: '%s'" % (
+ x.group(1), len(rows), lines))
+ else:
+ raise Exception("final number of rows line doesn't match.\n------------\n%s\n---------------\n" % lines)
+ return rows
+ except Exception, e:
+ import traceback
+ print "error parsing query result: %s" % e
+ traceback.print_exc()
+ raise e
+ # return []
+
+ def operation(self, query, expResult=None):
+ if self.inUse:
+ d = defer.Deferred()
+ self.workQueue.append({'deferred': d,
+ 'method': self.operation,
+ 'args': (query, expResult)})
+ return d
+
+ assert not self.inUse
+ assert not self.outputCollectorHook
+
+ self.inUse = True
+ self.output_buffer = ""
+ d = defer.Deferred()
+ self.outputCollectorDeferred = d
+ self.outputCollectorHook = self.psql.addHook(
+ EventMatcher(ProcessOutputEvent), self.outputCollector, query, d)
+ d.addCallback(self.checkQueryResult, query, expResult)
+
+ # defer writing to the process, so that the caller has the
+ # opportunity to add callbacks to the deferred we return.
+ reactor.callLater(0.0, self.psql.write, query + "\n")
+
+ return d
+
+ def checkQueryResult(self, result, query, expResult):
+ lines = []
+ for line in result.split("\n"):
+ line = line.strip()
+ if len(line) > 0 and not line.startswith("WARNING:") \
+ and not line.startswith("NOTICE:"):
+ lines.append(line)
+ lines = "\n".join(lines)
+ if expResult:
+ if isinstance(expResult, str):
+ self.assertEqual(expResult, lines,
+ "didn't get expected result for query '%s'" % query)
+ elif isinstance(expResult, list):
+ if not lines in expResult:
+ raise TestFailure("didn't get expected result",
+ "no result matches, got:\n%s\nfor query: '%s'\n" % (lines, query))
+ return lines
+
+
+ class TestDatabaseConnection(BaseTest):
+
+ needs = (('conn', "ISqlConnection"),)
+
+ description = "database connection"
+
+ def run(self):
+ return self.conn.query("SELECT 1 AS test;")
+
+
+ # FIXME: that's not actually a test, but it modifies the database state
+ class PopulateTestDatabase(BaseTest):
+
+ needs = (('conn', "ISqlConnection"),)
+
+ description = "populate test database"
+
+ def run(self):
+ conn = self.conn
+
+ # Create a test table for use in TestConcurrentUpdates and fill it
+ # with two test tuples.
+ d = conn.operation("CREATE TABLE test (i int PRIMARY KEY, t text);",
+ "CREATE TABLE")
+ d.addCallback(lambda x: conn.operation(
+ "INSERT INTO test VALUES (5, 'apple');",
+ "INSERT 0 1"))
+ d.addCallback(lambda x: conn.operation(
+ "INSERT INTO test VALUES (7, 'pear');",
+ "INSERT 0 1"))
+ d.addCallback(lambda x: conn.operation(
+ "INSERT INTO test VALUES (11, 'banana');",
+ "INSERT 0 1"))
+ return d
+
+
+ class PermutationTest(SyncTest):
+ """ Abstract class for testing a set of steps in all permutations of execution order.
+ This counts as a single test, although a subclass may accumulate counts which may be of
+ interest, and should therefore be shown regardless of success or failure of the test.
+ """
+
+ # stepDictionary maps a step ID to a function to run for that step.
+ stepDictionary = {}
+
+ # stepThreading is a list of lists.
+ # All permutations of interleaving of steps from the sublists will be generated.
+ # Steps from within each sublist are kept in order; only the interleaving is variable.
+ stepThreading = [[]]
+
+ # Override this to provide any per-iteration (permutation) setup.
+ def setUpIteration(self, stepIdList):
+ pass
+
+ # Override this to provide any per-iteration (permutation) teardown.
+ def tearDownIteration(self, stepIdList):
+ pass
+
+ def runIterationStep(self, stepId):
+ p = self.stepDictionary[stepId]
+ p()
+
+ def runIterationSteps(self, stepIdList):
+ try:
+ self.setUpIteration(stepIdList)
+ for stepId in stepIdList:
+ self.runIterationStep(stepId)
+ finally:
+ self.tearDownIteration(stepIdList)
+
+ def runPermutations(self, a):
+ self.runPermutations_recurse([], a)
+
+ def runPermutations_recurse(self, p, a):
+ found = False
+ for i in range(len(a)):
+ if len(a[i]) > 0:
+ found = True
+ r = p[:]
+ b = a[:]
+ r.append(b[i][0])
+ b[i] = b[i][1:]
+ self.runPermutations_recurse(r, b)
+ if not found:
+ self.runIterationSteps(p)
+
+ # If the dictionary is set up in this method, there can be references
+ # to class methods and fields.
+ def populateStepDictionary(self):
+ pass
+
+ def run(self):
+ self.populateStepDictionary()
+ self.runPermutations(self.stepThreading)
+ # The last two lines of output for the last entry seem to disappear???
+ print
+ print
+
+
+ class DummyPermutationTest(PermutationTest):
+ """ Simple test of the PermutationTest abstract class.
+ """
+
+ description = "simple test of the PermutationTest abstract class"
+
+ stepThreading = [['r1x','c1'],['r2x','c2']]
+
+ def setUpIteration(self, stepIdList):
+ print stepIdList
+
+ def tearDownIteration(self, stepIdList):
+ print
+
+ def printStepId(self, stepId):
+ print stepId,
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'r1x': lambda : self.printStepId('r1x'),
+ 'c1': lambda : self.printStepId('c1'),
+ 'r2x': lambda : self.printStepId('r2x'),
+ 'c2': lambda : self.printStepId('c2')
+ }
+
+
+ class DatabasePermutationTest(PermutationTest):
+ """ Abstract class to provide framework for using an IterativeTest for database queries.
+ """
+
+ commitRequiredCount = 0
+ commitRequiredOK = 0
+ rollbackRequiredCount = 0
+ rollbackRequiredOK = 0
+ commitPreferredCount = 0
+ commitPreferredOK = 0
+
+ serializationFailure = False
+
+ def commitRequired(self, stepIdList):
+ return True
+
+ def rollbackRequired(self, stepIdList):
+ return False
+
+ def countProgress(self, stepIdList):
+ if self.rollbackRequired(stepIdList):
+ self.rollbackRequiredCount += 1
+ if self.serializationFailure:
+ self.rollbackRequiredOK += 1
+ else:
+ if self.commitRequired(stepIdList):
+ self.commitRequiredCount += 1
+ if not self.serializationFailure:
+ self.commitRequiredOK += 1
+ else:
+ self.commitPreferredCount += 1
+ if not self.serializationFailure:
+ self.commitPreferredOK += 1
+
+ def runIterationSteps(self, stepIdList):
+ try:
+ self.setUpIteration(stepIdList)
+ for stepId in stepIdList:
+ self.runIterationStep(stepId)
+ self.countProgress(stepIdList)
+ finally:
+ self.tearDownIteration(stepIdList)
+
+ def tryOperation(self, conn, sql):
+ result = self.syncCall(10, conn.operation, sql).split("\n")
+ for line in result:
+ if len(line) > 0 and line.startswith("ERROR: could not serialize"):
+ self.serializationFailure = True
+ else:
+ if len(line) > 0 and line.startswith("ERROR:"):
+ raise TestFailure("failure other than serializable encountered: " + line, line)
+
+ def printStatistics(self):
+ print 'rollback required: ', self.rollbackRequiredOK, '/', self.rollbackRequiredCount
+ print 'commit required: ', self.commitRequiredOK, '/', self.commitRequiredCount
+ print 'commit preferred: ', self.commitPreferredOK, '/', self.commitPreferredCount
+
+ def run(self):
+ self.populateStepDictionary()
+ self.runPermutations(self.stepThreading)
+ self.printStatistics()
+ # The last two lines of output for the last entry seem to disappear???
+ print
+ print
+ if self.rollbackRequiredOK < self.rollbackRequiredCount:
+ raise TestFailure("serialization anomalies incorrectly allowed",
+ "Database integrity not protected.")
+ if self.commitRequiredOK < self.commitRequiredCount:
+ raise TestFailure("serialization failure occurred when it should not have",
+ "Transactions we thought we knew how to recognize as safe resulted in a rollback.")
+
+ def printStepResults(self, stepIdList):
+ if self.serializationFailure:
+ if self.commitRequired(stepIdList):
+ print 'rolled back ??'
+ else:
+ if not self.rollbackRequired(stepIdList):
+ print 'rolled back ?'
+ else:
+ print 'rolled back'
+ else:
+ if self.rollbackRequired(stepIdList):
+ print 'committed ***'
+ else:
+ print 'committed'
+
+
+ class SimpleWriteSkewTest(DatabasePermutationTest):
+ """ Write skew test.
+ This test has two serializable transactions: one which updates all
+ 'apple' rows to 'pear' and one which updates all 'pear' rows to
+ 'apple'. If these were serialized (run one at a time) either
+ value could be present, but not both. One must be rolled back to
+ prevent the write skew anomaly.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "write skew test"
+
+ stepThreading = [['rwx1','c1'],['rwx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rwx1': lambda : self.tryOperation(self.conn1, "UPDATE test SET t = 'apple' WHERE t = 'pear';"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'rwx2': lambda : self.tryOperation(self.conn2, "UPDATE test SET t = 'pear' WHERE t = 'apple';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ print stepIdList,
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'apple' WHERE i = 5;", "UPDATE 1")
+ self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'pear' WHERE i = 7;", "UPDATE 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return (stepIdList.index('c1') < stepIdList.index('rwx2')
+ or stepIdList.index('c2') < stepIdList.index('rwx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ReceiptReportTest(DatabasePermutationTest):
+ """ Daily Report of Receipts test.
+ This test doesn't persist a bad state in the database; rather, it
+ provides a view of the data which is not consistent with any
+ order of execution of the serializable transactions. It
+ demonstrates a situation where the deposit date for receipts could
+ be changed and a report of the closed day's receipts subsequently
+ run which will miss a receipt from the date which has been closed.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'),
+ ('conn3', 'ISqlConnection'))
+
+ description = "daily report of receipts test"
+
+ stepThreading = [['rxwy1','c1'],['wx2','c2'],['rx3','ry3','c3']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rxwy1': lambda : self.tryOperation(self.conn1, "INSERT INTO receipt VALUES (3, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 4.00);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE ctl SET deposit_date = DATE '2008-12-23' WHERE k = 'receipt';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;"),
+ 'rx3': lambda : self.tryOperation(self.conn3, "SELECT * FROM ctl WHERE k = 'receipt';"),
+ 'ry3': lambda : self.tryOperation(self.conn3, "SELECT * FROM receipt WHERE deposit_date = DATE '2008-12-22';"),
+ 'c3': lambda : self.tryOperation(self.conn3, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ print stepIdList,
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS ctl, receipt;")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE ctl (k text NOT NULL PRIMARY KEY, deposit_date date NOT NULL);")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO ctl VALUES ('receipt', DATE '2008-12-22');")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE receipt (receipt_no int NOT NULL PRIMARY KEY, deposit_date date NOT NULL, amount numeric(13,2));")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (1, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 1.00);")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (2, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 2.00);")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn3.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY;", "BEGIN")
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn3.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return ( (stepIdList.index('c1') < stepIdList.index('wx2')
+ and stepIdList.index('c1') < stepIdList.index('rx3'))
+ or (stepIdList.index('c2') < stepIdList.index('rxwy1')
+ and stepIdList.index('c2') < stepIdList.index('rx3'))
+ or (stepIdList.index('c3') < stepIdList.index('rxwy1')
+ and stepIdList.index('c3') < stepIdList.index('wx2'))
+ or (stepIdList.index('c2') < stepIdList.index('rxwy1')
+ and stepIdList.index('c3') < stepIdList.index('rxwy1'))
+ or (stepIdList.index('c1') < stepIdList.index('wx2')
+ and stepIdList.index('c3') < stepIdList.index('wx2'))
+ or (stepIdList.index('c1') < stepIdList.index('rx3')
+ and stepIdList.index('c2') < stepIdList.index('rx3')))
+
+ def rollbackRequired(self, stepIdList):
+ return ((stepIdList.index('c2') < stepIdList.index('c1')
+ and stepIdList.index('c2') < stepIdList.index('c3')
+ and stepIdList.index('rxwy1') < stepIdList.index('c2')
+ and stepIdList.index('rx3') < stepIdList.index('c1')
+ #############################################################
+ # The following test excludes some rows from rollback
+ # required for which we know our current SSI algorithm
+ # requires a rollback, but which don't, in fact, cause
+ # any anomaly. If we determine that we can allow pivots
+ # in which conflictIn and conflictOut are separate and
+ # overlapping transactions, these can be committed.
+ # To include these permutations in the "rollback required"
+ # count, comment out the following line.
+ and stepIdList.index('c2') < stepIdList.index('rx3')
+ #############################################################
+ )
+
+ #############################################################
+ # An anomaly can't actually occur based on the following
+ # "or" clause, but we know that our algorithm can't
+ # currently detect that, because T2's conflictIn is set
+ # to a self-reference because of multiple conflicts.
+ # To count these in the "rollback required" list, uncomment
+ # this section; otherwise they are "commit preferred"..
+ # or (stepIdList.index('rxwy1') < stepIdList.index('c1')
+ # and stepIdList.index('rxwy1') < stepIdList.index('c2')
+ # and stepIdList.index('rxwy1') < stepIdList.index('c3')
+ # and stepIdList.index('wx2') < stepIdList.index('c1')
+ # and stepIdList.index('wx2') < stepIdList.index('c2')
+ # and stepIdList.index('wx2') < stepIdList.index('c3')
+ # and stepIdList.index('rx3') < stepIdList.index('c1')
+ # and stepIdList.index('rx3') < stepIdList.index('c2')
+ # and stepIdList.index('rx3') < stepIdList.index('c3')
+ # )
+ #############################################################
+ )
+
+
+ class TemporalRangeIntegrityTest(DatabasePermutationTest):
+ """ Temporal range integrity test.
+ Snapshot integrity fails with simple referential integrity tests,
+ but those don't make for good demonstrations because people just
+ say that foreign key definitions should be used instead. There
+ are many integrity tests which are conceptually very similar but
+ don't have built-in support which will fail when used in triggers.
+ This is intended to illustrate such cases. It is obviously very
+ hard to exercise all these permutations when the code is actually
+ in a trigger; this test pulls what would normally be inside of
+ triggers out to the top level to control the permutations.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "temporal range integrity test"
+
+ stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date <= DATE '2009-05-15' AND (exp_date IS NULL OR exp_date > DATE '2009-05-15');"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO offense VALUES (1, '123.45(1)a', DATE '2009-05-15');"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM offense WHERE statute_cite = '123.45(1)a' AND offense_date >= DATE '2008-01-01';"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date = DATE '2008-01-01';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS statute, offense;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE statute (statute_cite text NOT NULL, eff_date date NOT NULL, exp_date date, CONSTRAINT statute_pkey PRIMARY KEY (statute_cite, eff_date));", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO statute VALUES ('123.45(1)a', DATE '2008-01-01', NULL);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE offense (offense_no int NOT NULL, statute_cite text NOT NULL, offense_date date NOT NULL, CONSTRAINT offense_pkey PRIMARY KEY (offense_no));", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('ry2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ProjectManagerTest(DatabasePermutationTest):
+ """ Project manager test.
+ Ensure that the person who is on the project as a manager
+ is flagged as a project manager in the person table.
+ """
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "project manager test"
+
+ # Per-connection step ordering; the framework presumably runs every
+ # interleaving of the two lists that preserves each list's own order.
+ stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']]
+
+ # Map step ids to the SQL each step executes.  T1 checks the manager
+ # flag then inserts a project; T2 checks for projects then clears the
+ # flag -- a classic write-skew pair under snapshot isolation.
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM person WHERE person_id = 1 AND is_project_manager;"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO project VALUES (101, 'Build Great Wall', 1);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM project WHERE project_manager = 1;"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE person SET is_project_manager = false WHERE person_id = 1;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ # Rebuild the person/project tables and open a SERIALIZABLE transaction
+ # on both connections before each permutation.
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS person, project;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE person (person_id int NOT NULL PRIMARY KEY, name text NOT NULL, is_project_manager bool NOT NULL);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO person VALUES (1, 'Robert Haas', true);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE project (project_no int NOT NULL PRIMARY KEY, description text NOT NULL, project_manager int NOT NULL);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ # Roll back whatever is left open and report the step results.
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ # Non-overlapping permutations (one commit before the other's read)
+ # must succeed; all others must raise a serialization failure.
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('ry2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ClassroomSchedulingTest(DatabasePermutationTest):
+ """ Classroom scheduling test.
+ Ensure that the classroom is not scheduled more than once
+ for any moment in time.
+ """
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "classroom scheduling test"
+
+ # Per-connection step ordering; the framework permutes interleavings.
+ stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']]
+
+ # T1 checks for an overlapping reservation then books 13:00-14:00;
+ # T2 checks 13:30-14:30 then moves an existing booking into that slot.
+ # Serialized execution can never double-book the room.
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:00' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:00';"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 13:00', TIMESTAMP WITH TIME ZONE '2010-04-01 14:00', 'Carol');"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:30';"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE room_reservation SET start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 13:30', end_time = TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' WHERE room_id = '101' AND start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 10:00';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ # Rebuild the reservation table with one existing 10:00-11:00 booking
+ # and open a SERIALIZABLE transaction on both connections.
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS room_reservation;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE room_reservation (room_id text NOT NULL, start_time timestamp with time zone NOT NULL, end_time timestamp with time zone NOT NULL, description text NOT NULL, CONSTRAINT room_reservation_pkey PRIMARY KEY (room_id, start_time));", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 10:00', TIMESTAMP WITH TIME ZONE '2010-04-01 11:00', 'Bob');", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ # Roll back whatever is left open and report the step results.
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ # Non-overlapping permutations must succeed; overlapping ones must
+ # raise a serialization failure.
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('ry2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class TotalCashTest(DatabasePermutationTest):
+ """ Total cash test.
+ Another famous test of snapshot isolation anomaly.
+ """
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "total cash test"
+
+ # Each transaction withdraws from a different account, then reads the
+ # total; under snapshot isolation both can see a stale total (neither
+ # observes the other's withdrawal), which no serial order allows.
+ stepThreading = [['wx1','rxy1','c1'],['wy2','rxy2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'wx1': lambda : self.tryOperation(self.conn1, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'checking';"),
+ 'rxy1': lambda : self.tryOperation(self.conn1, "SELECT SUM(balance) FROM accounts;"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'wy2': lambda : self.tryOperation(self.conn2, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'savings';"),
+ 'rxy2': lambda : self.tryOperation(self.conn2, "SELECT SUM(balance) FROM accounts;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ # Rebuild the accounts table (600 in each account) and open a
+ # SERIALIZABLE transaction on both connections.
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS accounts;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE accounts (accountid text NOT NULL PRIMARY KEY, balance numeric not null);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO accounts VALUES ('checking', 600),('savings',600);", "INSERT 0 2")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ # Roll back whatever is left open and report the step results.
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ # Here the conflict boundary is the other transaction's first WRITE
+ # (wx1/wy2), not a read as in the preceding tests.
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('wy2')
+ or stepIdList.index('c2') < stepIdList.index('wx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ReferentialIntegrityTest(DatabasePermutationTest):
+ """ Referential integrity test.
+ The assumption here is that the application code issuing the SELECT
+ to test for the presence or absence of a related record would do the
+ right thing -- this script doesn't include that logic.
+ """
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "referential integrity test"
+
+ # T1 verifies the parent row exists, then inserts a child reference;
+ # T2 verifies no child references exist, then deletes the parent.
+ stepThreading = [['rx1','wy1','c1'],['rx2','ry2','wx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT i FROM a WHERE i = 1;"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO b VALUES (1);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'rx2': lambda : self.tryOperation(self.conn2, "SELECT i FROM a WHERE i = 1;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT a_id FROM b WHERE a_id = 1;"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM a WHERE i = 1;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ # Rebuild parent table a (one row) and child table b (empty; note: no
+ # declared FK -- the test simulates application-level RI checks) and
+ # open a SERIALIZABLE transaction on both connections.
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS a, b;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE a (i int PRIMARY KEY);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE b (a_id int);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO a VALUES (1);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ # Roll back whatever is left open and report the step results.
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ # Non-overlapping permutations must succeed; overlapping ones must
+ # raise a serialization failure.
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('rx2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class RITriggerTest(DatabasePermutationTest):
+ """ Referential integrity trigger test.
+ Like ReferentialIntegrityTest, but the presence/absence checks are
+ done by PL/pgSQL triggers inside the same transactions, raising
+ SQLSTATE 23503 when a constraint would be violated.
+ """
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "referential integrity trigger test"
+
+ # T1's INSERT fires ri_child (write child, read parent); T2's DELETE
+ # fires ri_parent (write parent, read child).  'r2' is a trivial read
+ # so T2 acquires a snapshot before T1 commits in some permutations.
+ stepThreading = [['wxry1','c1'],['r2','wyrx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'wxry1': lambda : self.tryOperation(self.conn1, "INSERT INTO child (parent_id) VALUES (0);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'r2': lambda : self.tryOperation(self.conn2, "SELECT TRUE;"),
+ 'wyrx2': lambda : self.tryOperation(self.conn2, "DELETE FROM parent WHERE parent_id = 0;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ # Build parent/child tables plus the two RI triggers, seed one parent
+ # row, and open a SERIALIZABLE transaction on both connections.
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS parent, child;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE parent (parent_id SERIAL NOT NULL PRIMARY KEY);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE child (child_id SERIAL NOT NULL PRIMARY KEY, parent_id INTEGER NOT NULL);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_parent() RETURNS TRIGGER AS $body$\
+ BEGIN\
+ PERFORM TRUE FROM child WHERE parent_id = OLD.parent_id;\
+ IF FOUND THEN\
+ RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || OLD.parent_id || ' still referenced during ' || TG_OP;\
+ END IF;\
+ RETURN NULL;\
+ END;\
+ $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION")
+ self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_parent AFTER UPDATE OR DELETE ON parent FOR EACH ROW EXECUTE PROCEDURE ri_parent();", "CREATE TRIGGER")
+ self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_child() RETURNS TRIGGER AS $body$\
+ BEGIN\
+ PERFORM TRUE FROM parent WHERE parent_id = NEW.parent_id;\
+ IF NOT FOUND THEN\
+ RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || NEW.parent_id || ' does not exist during ' || TG_OP;\
+ END IF;\
+ RETURN NULL;\
+ END;\
+ $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION")
+ self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_child AFTER INSERT OR UPDATE ON child FOR EACH ROW EXECUTE PROCEDURE ri_child();", "CREATE TRIGGER")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO parent VALUES(0);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ # Override the normal method to allow failures generated by the trigger code
+ # to be considered "success". Just so we can count things up.
+ # NOTE(review): the trailing comma wraps the syncCall result in a
+ # 1-tuple, so the loop body runs exactly once over the whole result --
+ # confirm that is intended rather than iterating individual lines.
+ # NOTE(review): the second 'len(line) > 0' test below is redundant
+ # (the first conjunct already established it).
+ def tryOperation(self, conn, sql):
+ result = self.syncCall(10, conn.operation, sql),
+ for line in result:
+ if len(line) > 0 and line.startswith("ERROR: could not serialize"):
+ self.serializationFailure = True
+ else:
+ if (len(line) > 0 and line.startswith("ERROR:")
+ and len(line) > 0 and not line.startswith("ERROR: Parent 0 ")):
+ raise TestFailure("failure other than serializable encountered: " + line, line)
+
+ # Roll back whatever is left open and report the step results.
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ # Non-overlapping permutations must succeed; overlapping ones must
+ # raise a serialization failure.
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('r2')
+ or stepIdList.index('c2') < stepIdList.index('wxry1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class TestTrueSerializabilityConcurrentUpdates(SyncTest):
+ """ Runs three transactions concurrently, each reading from what the
+ other writes in turn. Should raise a serialization failure, but
+ instead leads to wrong results, at the moment.
+ """
+
+ description = "concurrent updates"
+
+ # Three SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'),
+ ('conn3', 'ISqlConnection'))
+
+ # Issue the same statement on every connection in parallel; returns a
+ # DeferredList that errbacks as soon as any connection fails.
+ def execOnAllConnections(self, sql, expRes=None):
+ deferreds = []
+ for conn in self.connections:
+ d = conn.operation(sql, expRes)
+ deferreds.append(d)
+
+ d = defer.DeferredList(deferreds,
+ consumeErrors=True, fireOnOneErrback=True)
+ return d
+
+ # Read the 't' value of one row, then (via callback) write it to
+ # another row -- the building block of the three-way rw cycle.
+ def readValueThenWrite(self, conn, readFromId, writeToId):
+ d = conn.query("SELECT t FROM test WHERE i = %d;" % readFromId)
+ d.addCallback(self.writeValueBack, conn, writeToId)
+ return d
+
+ # Callback stage: copy the single value just read into the target row.
+ def writeValueBack(self, result, conn, writeToId):
+ self.assertEqual(1, len(result),
+ "expected exactly one result row")
+ row = result[0]
+ self.assertEqual(1, len(row),
+ "expected exactly one column")
+ value = row['t']
+ d = conn.operation("UPDATE test SET t = '%s' WHERE i = %d;" % (value, writeToId),
+ "UPDATE")
+ return d
+
+ # Build the cycle 5 -> 7 -> 11 -> 5 across the three transactions.
+ def startConcurrentOperations(self):
+ d1 = self.readValueThenWrite(self.conn1, readFromId=5, writeToId=7)
+ d2 = self.readValueThenWrite(self.conn2, readFromId=7, writeToId=11)
+ d3 = self.readValueThenWrite(self.conn3, readFromId=11, writeToId=5)
+ return defer.DeferredList([d1, d2, d3],
+ consumeErrors=True, fireOnOneErrback=True)
+
+ # Entry point: ensure ROLLBACK is always issued, even on failure.
+ def run(self):
+ try:
+ self.sub_run()
+ finally:
+ self.syncCall(10, self.execOnAllConnections, "ROLLBACK;")
+
+ def sub_run(self):
+ self.connections = [
+ self.conn1,
+ self.conn2,
+ self.conn3]
+
+ # begin a transaction on all three connections
+ self.syncCall(10, self.execOnAllConnections,
+ "BEGIN;", "BEGIN")
+
+ # set their isolation level to SERIALIZABLE
+ self.syncCall(10, self.execOnAllConnections,
+ "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET")
+
+ # concurrently let each of the three transactions read a value and
+ # write that to another tuple, wait for all the UPDATEs to complete
+ # before trying to commit any of the transactions
+ self.syncCall(10, self.startConcurrentOperations)
+
+ # try to commit all three transactions (accepting both COMMIT or
+ # ERROR, we check the result later on).
+ self.syncCall(10, self.execOnAllConnections,
+ "COMMIT;", "COMMIT|ERROR");
+
+ # count the occurrence of each fruit
+ result = self.syncCall(10, self.conn1.query,
+ "SELECT t FROM test WHERE i IN (5, 7, 11);")
+ counters = {'banana': 0, 'apple': 0, 'pear': 0}
+ for row in result:
+ counters[row['t']] += 1
+
+ # you currently get one fruit each, as no transaction gets aborted,
+ # which is impossible if the transactions had been executed one
+ # after another.  (Python 2: dict.values() returns a list; the
+ # comparison works regardless of key order since all counts are 1.)
+ if counters.values() == [1, 1, 1]:
+ raise TestFailure("conflict not detected",
+ "All transactions committed, so the conflict hasn't been detected.")
+
+ class TestTrueSerializabilityConcurrentInsert(BaseTest):
+ """ Runs two transactions, both doing an insert, first, then select
+ all the relevant rows (within the range 100 <= i < 110). We let the
+ first transaction commit before creating the cyclic dependency,
+ which forces transaction 2 to abort.
+ """
+
+ description = "concurrent insert"
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ # Issue the same statement on every connection in parallel; returns a
+ # DeferredList that errbacks as soon as any connection fails.
+ def execOnAllConnections(self, sql, expRes=None):
+ deferreds = []
+ for conn in self.connections:
+ d = conn.operation(sql, expRes)
+ deferreds.append(d)
+
+ d = defer.DeferredList(deferreds,
+ consumeErrors=True, fireOnOneErrback=True)
+ return d
+
+ # Async entry point: builds the whole scenario as a Deferred chain.
+ def run(self):
+ self.connections = [
+ self.conn1,
+ self.conn2]
+
+ # begin a transaction on both connections
+ d = self.execOnAllConnections("BEGIN;", "BEGIN")
+
+ # set their isolation level to SERIALIZABLE
+ d.addCallback(lambda x:
+ self.execOnAllConnections(
+ "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET"))
+
+ # let transaction 1 do an insert (so it acquires a snapshot)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "INSERT INTO test (i, t) VALUES (101, 'orange');", "INSERT 0 1"))
+
+ # then same for transaction 2
+ d.addCallback(lambda x:
+ self.conn2.operation(
+ "INSERT INTO test (i, t) VALUES (102, 'grapefruit');", "INSERT 0 1"))
+
+ # read the relevant rows, acquiring an SIREAD lock on the predicate
+ # (the result is discarded).
+ # NOTE(review): the original comment said "transaction 1" here, but
+ # the code queries conn2 -- confirm which was intended.
+ d.addCallback(lambda x:
+ self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;"))
+
+ # then commit transaction 1 (which should still succeed)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "COMMIT;", "COMMIT"))
+
+ # try to read all rows with the second transaction's snapshot (which
+ # doesn't see the update of transaction 1)
+ d.addCallback(lambda x:
+ self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;"))
+
+ # With SSI in place, this should lock the same predicate with an
+ # SIREAD lock, which should bomb out on the orange (tuple i = 101)
+ # from transaction 1.
+ #
+ # dtester FIXME: Hm.. this could need some "expect to fail" help
+ # from dtester
+ d.addCallback(self.checkResult)
+
+ # cleanup both transactions, especially in case of failure
+ d.addBoth(self.cleanup)
+
+ return d
+
+ # The SELECT is expected to fail with a serialization error; a plain
+ # (non-Failure) result therefore means the conflict went undetected.
+ def checkResult(self, result):
+ if not isinstance(result, failure.Failure):
+ raise TestFailure("conflict not detected",
+ "SELECT should raise a serialization error")
+ return result
+
+ # Roll back both connections, passing the prior result (or error)
+ # through unchanged.
+ def cleanup(self, result):
+ d = self.execOnAllConnections("ROLLBACK;")
+
+ # ignore errors above, but instead make sure we return the result
+ # we got here, especially if it was an error.
+ d.addBoth(lambda x: result)
+ return d
+
+ class TestTrueSerializabilityConcurrentInsert2(BaseTest):
+ """ Pretty similar to the above test, except that the first transaction
+ doesn't read (and thus predicate lock) the relevant rows. This still
+ leaves a possible serialization ordering, even if it doesn't match
+ the real commit ordering.
+
+ Uses rows 200 <= i < 210
+ """
+
+ # NOTE(review): duplicates the previous test's description; consider
+ # "concurrent insert 2" so reports can tell the two apart.
+ description = "concurrent insert"
+
+ # Two SQL connections injected by the dtester framework.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ # Issue the same statement on every connection in parallel; returns a
+ # DeferredList that errbacks as soon as any connection fails.
+ def execOnAllConnections(self, sql, expRes=None):
+ deferreds = []
+ for conn in self.connections:
+ d = conn.operation(sql, expRes)
+ deferreds.append(d)
+
+ d = defer.DeferredList(deferreds,
+ consumeErrors=True, fireOnOneErrback=True)
+ return d
+
+ # Async entry point: builds the whole scenario as a Deferred chain.
+ def run(self):
+ self.connections = [
+ self.conn1,
+ self.conn2]
+
+ # begin a transaction on both connections
+ d = self.execOnAllConnections("BEGIN;", "BEGIN")
+
+ # set their isolation level to SERIALIZABLE
+ d.addCallback(lambda x:
+ self.execOnAllConnections(
+ "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET"))
+
+ # let transaction 1 do an insert (so it acquires a snapshot)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "INSERT INTO test (i, t) VALUES (201, 'orange');", "INSERT 0 1"))
+
+ # then same for transaction 2
+ d.addCallback(lambda x:
+ self.conn2.operation(
+ "INSERT INTO test (i, t) VALUES (202, 'grapefruit');", "INSERT 0 1"))
+
+ # no SELECT here, so transaction 1 doesn't acquire any SIREAD lock
+
+ # then commit transaction 1 (which should succeed)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "COMMIT;", "COMMIT"))
+
+ # try to read all rows with the second transaction's snapshot (which
+ # doesn't see the update of transaction 1)
+ d.addCallback(lambda x:
+ self.conn2.query("SELECT t FROM test WHERE i >= 200 AND i < 210;"))
+
+ # With SSI in place, this should lock the same predicate as above
+ # with an SIREAD lock. This includes the row just written by the
+ # first transaction.
+ #
+ # As long as there are no other edges, this still leaves a possible
+ # serialization ordering: if we executed the second transaction
+ # *before* the first one, the second didn't see the 'orange' row
+ # inserted "later" by the first transaction. That's the result we
+ # expect.
+ d.addCallback(self.checkResult)
+
+ # commit transaction 2
+ d.addCallback(lambda x:
+ self.conn2.operation(
+ "COMMIT;", "COMMIT"))
+
+ # add a cleanup handler
+ d.addErrback(self.cleanup)
+
+ return d
+
+ # Expect exactly the grapefruit row: T2's snapshot must not see T1's
+ # insert, and no serialization failure is expected here.
+ def checkResult(self, result):
+ self.assertEqual(len(result), 1,
+ "Expected exactly one row, got %d (%s)" % (
+ len(result), repr(result)))
+ self.assertEqual(result[0], {"t": "grapefruit"},
+ "Expected to read the grapefruit row, but got %s" % (result[0],))
+
+ return result
+
+ # Roll back both connections, passing the prior result (or error)
+ # through unchanged.
+ def cleanup(self, result):
+ d = self.execOnAllConnections("ROLLBACK;")
+
+ # ignore errors above, but instead make sure we return the result
+ # we got here, especially if it was an error.
+ d.addBoth(lambda x: result)
+ return d
+
+
+ # ****** test running code ************************************************
+
+ class Logger(object):
+ """ A simplistic logger that just writes it all into one single file.
+ """
+ # Opens the log file for writing (truncating any existing file).
+ def __init__(self, logFileName):
+ self.logfile = open(logFileName, 'w')
+
+ # NOTE(review): if open() raised in __init__, self.logfile is unset and
+ # this __del__ raises AttributeError during garbage collection --
+ # consider guarding with hasattr/getattr.
+ def __del__(self):
+ self.logfile.close()
+
+ # Event hook: append the event's string form as one line and flush so
+ # the log survives a crash.
+ def callback(self, event):
+ self.logfile.write(str(event) + "\n")
+ self.logfile.flush()
+
+ # Command-line driver: parse options, derive installation paths from the
+ # autoconf placeholders (@bindir@ etc., substituted at build time), set up
+ # logging, define the dtester suite dependency graph, and run it.
+ # argv: command-line arguments WITHOUT the program name.
+ def main(argv):
+ print "Postgres dtester suite Copyright (c) 2004-2010, by Markus Wanner\n"
+
+ postgres_configure_args = "@configure_args@"
+
+ config = {
+ 'temp-port': 65432,
+
+ # by default, use the same installation directory as make check
+ 'inst_dir': os.path.join(os.getcwd(), 'tmp_check/install'),
+
+ # and a similar prefix
+ 'pgdata_prefix': os.path.join(os.getcwd(), 'tmp_check/data-dtester'),
+ 'logfile' : os.path.join(os.getcwd(), 'dtester.log'),
+
+ 'enable_cassert': 'enable_cassert' in postgres_configure_args
+ }
+
+ try:
+ opts, args = getopt.getopt(argv,
+ "h",
+ ["help", "temp-install", "top-builddir=", "temp-port=",
+ "multibyte="])
+ except getopt.GetoptError:
+ usage()
+ sys.exit(2)
+
+ # NOTE(review): the single-element "tuples" below, e.g.
+ # ("--temp-install"), are plain strings, so 'opt in (...)' is a
+ # substring test rather than tuple membership -- it happens to work
+ # for exact matches but should be ("--temp-install",).
+ for opt, arg in opts:
+ if opt in ("-h", "--help"):
+ usage()
+ sys.exit()
+ elif opt in ("--temp-install"):
+ config["temp-install"] = True
+ elif opt in ("--temp-port"):
+ try:
+ arg = int(arg)
+ if arg >= 1024 and arg <= 65535:
+ config["temp-port"] = arg
+ else:
+ print "temp-port out of range."
+ sys.exit(2)
+ except ValueError:
+ print "Fatal: invalid temp-port specified"
+ sys.exit(2)
+ elif opt in ("--top-builddir"):
+ config["top-builddir"] = arg
+
+
+ # Derive bindir/libdir/datadir under inst_dir from the autoconf
+ # placeholders, stripping a leading '/' so os.path.join doesn't treat
+ # them as absolute paths.
+ if not config.has_key('bindir'):
+ bindir = '@bindir@'
+ if bindir[0] == '/':
+ bindir = bindir[1:]
+ config['bindir'] = os.path.join(config['inst_dir'], bindir)
+ if not config.has_key('libdir'):
+ libdir = '@libdir@'
+ if libdir[0] == '/':
+ libdir = libdir[1:]
+ config['libdir'] = os.path.join(config['inst_dir'], libdir)
+ if not config.has_key('datadir'):
+ datadir = '@datadir@'
+ if datadir[0] == '/':
+ datadir = datadir[1:]
+ config['datadir'] = os.path.join(config['inst_dir'], datadir)
+
+
+ # FIXME: should not have to be here
+ logger = Logger(config['logfile'])
+ config['main_logging_hook'] = (EventMatcher(Event), logger.callback)
+
+
+ # definition of tests and suites, including their dependencies
+ # ('uses'/'depends' wire suites together; 'onlyAfter' orders tests;
+ # 'xfail' marks tests expected to fail until SSI is complete).
+ tdef = {
+ # runs 'make install' to make sure the installation is up to date
+ 'temp_install': {'class': InstallationSuite,
+ 'uses': ('__system__',)},
+
+ # runs initdb, providing the Postgres data directory
+ 'initdb-0': {'class': InitdbSuite,
+ 'uses': ('temp_install',),
+ 'args': (0,)},
+
+ # runs a postmaster on the created database directory
+ 'pg-0': {'class': PostmasterSuite,
+ 'uses': ('temp_install', 'initdb-0')},
+
+ # creates a test database on pg-0
+ 'testdb': {'class': TestDatabaseSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',)},
+
+ # open two connections
+ 'conn-0A': {'class': SqlConnectionSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',),
+ 'depends': ('testdb',)},
+ 'conn-0B': {'class': SqlConnectionSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',),
+ 'depends': ('testdb',)},
+ 'conn-0C': {'class': SqlConnectionSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',),
+ 'depends': ('testdb',)},
+
+ # test the connections
+ 'test-conn-0A': {'class': TestDatabaseConnection,
+ 'uses': ('conn-0A',)},
+ 'test-conn-0B': {'class': TestDatabaseConnection,
+ 'uses': ('conn-0B',)},
+ 'test-conn-0C': {'class': TestDatabaseConnection,
+ 'uses': ('conn-0C',)},
+
+ # 'dummy-recursion': {'class': DummyPermutationTest},
+
+ # populate the test database
+ 'populate-testdb': {'class': PopulateTestDatabase,
+ 'uses': ('conn-0A',),
+ 'onlyAfter': ('test-conn-0A', 'test-conn-0B',
+ 'test-conn-0C')},
+
+ 'simple-write-skew': {'class': SimpleWriteSkewTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('populate-testdb',),
+ 'xfail': True},
+
+ 'receipt-report': {'class': ReceiptReportTest,
+ 'uses': ('conn-0A', 'conn-0B', 'conn-0C'),
+ 'onlyAfter': ('simple-write-skew',),
+ 'xfail': True},
+
+ 'temporal-range': {'class': TemporalRangeIntegrityTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('receipt-report',),
+ 'xfail': True},
+
+ 'project-manager': {'class': ProjectManagerTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('temporal-range',),
+ 'xfail': True},
+
+ 'classroom-scheduling': {'class': ClassroomSchedulingTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('project-manager',),
+ 'xfail': True},
+
+ 'total-cash': {'class': TotalCashTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('classroom-scheduling',),
+ 'xfail': True},
+
+ 'referential-integrity': {'class': ReferentialIntegrityTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('total-cash',),
+ 'xfail': True},
+
+ 'ri-trigger': {'class': RITriggerTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('referential-integrity',),
+ 'xfail': True}
+
+ # 'ser-updates': {'class': TestTrueSerializabilityConcurrentUpdates,
+ # 'uses': ('conn-0A', 'conn-0B', 'conn-0C'),
+ # 'onlyAfter': ('populate-testdb',),
+ # 'xfail': True},
+ #
+ # 'ser-insert': {'class': TestTrueSerializabilityConcurrentInsert,
+ # 'uses': ('conn-0A', 'conn-0B'),
+ # 'onlyAfter': ('ser-updates',),
+ # 'xfail': True},
+ #
+ # 'ser-insert2': {'class': TestTrueSerializabilityConcurrentInsert2,
+ # 'uses': ('conn-0A', 'conn-0B'),
+ # 'onlyAfter': ('ser-insert',)}
+ }
+
+
+ # Execute the whole graph with generous per-test and per-suite timeouts.
+ runner = Runner(testTimeout=600, suiteTimeout=3600)
+ runner.run(tdef, config)
+
+
+ # Script entry point: strip the program name before passing args to main().
+ if __name__ == "__main__":
+ main(sys.argv[1:])
+
On Wed, May 19, 2010 at 5:37 PM, Kevin Grittner
<Kevin.Grittner@wicourts.gov> wrote:
I heard that others were considering work on predicate locks for
9.1. Since Dan Ports of MIT and I have been working on that for the
serializable implementation for the last few weeks, I felt it would
be good to post a WIP patch to avoid duplicate effort.
I added this to the next commitfest with the 'WIP' marker prominent.
I figured it was worth including for initial reviews, although of
course, detailed work will likely wait until July.
-selena
--
http://chesnok.com/daily - me
Attached is an updated patch to correct for bitrot and include the
latest work.
Whoever reviews this will probably want to review the Serializable
Wiki page:
http://wiki.postgresql.org/wiki/Serializable
Some areas on which I would particularly appreciate feedback in the
initial review:
CODE ORGANIZATION: I started with predicate.c based on the lock.c
code, and so fell into the src/backend/storage/lmgr directory. It
has morphed into something which probably doesn't belong in that
directory, but I'm not sure where it *does* belong. An argument
could be made that the SSI logic should be split from the predicate
locking, except that there is so little code that isn't part of
tracking the predicate locks and their conflicts, I'm not sure about
that either. In a similar vein, the structures in predicate.h are
used in exactly one place outside of predicate.c; it might make
sense to split that .h file so that most places only bring in the
function prototypes, which is all they need. Or perhaps the code
added to lockfuncs.c should be moved to the new predicate.c file,
and called from lockfuncs.c?
NAMING: The names in predicate.c are somewhat inconsistent, and a
few of them outright annoy me -- particularly MyXxx in a structure
field. I want to do a pass to make the names more consistent, but
would appreciate any feedback on what's good, bad, or ugly in the
current code before I do.
CORNER CASES: What did we forget to handle?
-Kevin
Attachments:
serializable-2.patchtext/plain; name=serializable-2.patchDownload
*** a/GNUmakefile.in
--- b/GNUmakefile.in
***************
*** 75,81 **** distclean maintainer-clean:
check: all
! check installcheck installcheck-parallel:
$(MAKE) -C src/test $@
installcheck-world:
--- 75,81 ----
check: all
! check dcheck installcheck installcheck-parallel:
$(MAKE) -C src/test $@
installcheck-world:
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
***************
*** 57,62 ****
--- 57,63 ----
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/standby.h"
***************
*** 261,280 **** heapgetpage(HeapScanDesc scan, BlockNumber page)
{
if (ItemIdIsNormal(lpp))
{
bool valid;
if (all_visible)
valid = true;
else
{
- HeapTupleData loctup;
-
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
loctup.t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(loctup.t_self), page, lineoff);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
}
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
--- 262,283 ----
{
if (ItemIdIsNormal(lpp))
{
+ HeapTupleData loctup;
bool valid;
if (all_visible)
valid = true;
else
{
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
loctup.t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(loctup.t_self), page, lineoff);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
}
+
+ CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer);
+
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
***************
*** 468,479 **** heapgettup(HeapScanDesc scan,
--- 471,485 ----
snapshot,
scan->rs_cbuf);
+ CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf);
+
if (valid && key != NULL)
HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
nkeys, key, valid);
if (valid)
{
+ PredicateLockTuple(scan->rs_rd, tuple);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
return;
}
***************
*** 741,752 **** heapgettup_pagemode(HeapScanDesc scan,
--- 747,760 ----
nkeys, key, valid);
if (valid)
{
+ PredicateLockTuple(scan->rs_rd, tuple);
scan->rs_cindex = lineindex;
return;
}
}
else
{
+ PredicateLockTuple(scan->rs_rd, tuple);
scan->rs_cindex = lineindex;
return;
}
***************
*** 1460,1467 **** heap_fetch(Relation relation,
--- 1468,1478 ----
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ CheckForSerializableConflictOut(valid, relation, tuple, buffer);
+
if (valid)
{
+ PredicateLockTuple(relation, tuple);
/*
* All checks passed, so return the tuple as valid. Caller is now
* responsible for releasing the buffer.
***************
*** 1505,1517 **** heap_fetch(Relation relation,
* heap_fetch, we do not report any pgstats count; caller may do so if wanted.
*/
bool
! heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
! bool *all_dead)
{
Page dp = (Page) BufferGetPage(buffer);
TransactionId prev_xmax = InvalidTransactionId;
OffsetNumber offnum;
bool at_chain_start;
if (all_dead)
*all_dead = true;
--- 1516,1530 ----
* heap_fetch, we do not report any pgstats count; caller may do so if wanted.
*/
bool
! heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
! Snapshot snapshot, bool *all_dead)
{
Page dp = (Page) BufferGetPage(buffer);
TransactionId prev_xmax = InvalidTransactionId;
OffsetNumber offnum;
bool at_chain_start;
+ bool valid;
+ bool match_found;
if (all_dead)
*all_dead = true;
***************
*** 1521,1526 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
--- 1534,1540 ----
Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
offnum = ItemPointerGetOffsetNumber(tid);
at_chain_start = true;
+ match_found = false;
/* Scan through possible multiple members of HOT-chain */
for (;;)
***************
*** 1551,1556 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
--- 1565,1572 ----
heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
heapTuple.t_len = ItemIdGetLength(lp);
+ heapTuple.t_tableOid = relation->rd_id;
+ heapTuple.t_self = *tid;
/*
* Shouldn't see a HEAP_ONLY tuple at chain start.
***************
*** 1568,1579 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
break;
/* If it's visible per the snapshot, we must return it */
! if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
{
ItemPointerSetOffsetNumber(tid, offnum);
if (all_dead)
*all_dead = false;
! return true;
}
/*
--- 1584,1601 ----
break;
/* If it's visible per the snapshot, we must return it */
! valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer);
! CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer);
! if (valid)
{
ItemPointerSetOffsetNumber(tid, offnum);
+ PredicateLockTuple(relation, &heapTuple);
if (all_dead)
*all_dead = false;
! if (IsXactIsoLevelFullySerializable)
! match_found = true;
! else
! return true;
}
/*
***************
*** 1602,1608 **** heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
break; /* end of chain */
}
! return false;
}
/*
--- 1624,1630 ----
break; /* end of chain */
}
! return match_found;
}
/*
***************
*** 1621,1627 **** heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
LockBuffer(buffer, BUFFER_LOCK_SHARE);
! result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
return result;
--- 1643,1649 ----
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
LockBuffer(buffer, BUFFER_LOCK_SHARE);
! result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
return result;
***************
*** 1728,1735 **** heap_get_latest_tid(Relation relation,
--- 1750,1760 ----
* result candidate.
*/
valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
+ CheckForSerializableConflictOut(valid, relation, &tp, buffer);
if (valid)
+ {
*tid = ctid;
+ }
/*
* If there's a valid t_ctid link, follow it, else we're done.
***************
*** 1892,1897 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid,
--- 1917,1929 ----
buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
InvalidBuffer, options, bistate);
+ /*
+ * We're about to do the actual insert -- check for conflict at the
+ * relation or buffer level first, to avoid possibly having to roll
+ * back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, NULL, buffer);
+
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
***************
*** 2192,2197 **** l1:
--- 2224,2235 ----
return result;
}
+ /*
+ * We're about to do the actual delete -- check for conflict first,
+ * to avoid possibly having to roll back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, &tp, buffer);
+
/* replace cid with a combo cid if necessary */
HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
***************
*** 2545,2550 **** l2:
--- 2583,2594 ----
return result;
}
+ /*
+ * We're about to do the actual update -- check for conflict first,
+ * to avoid possibly having to roll back work we've just done.
+ */
+ CheckForSerializableConflictIn(relation, &oldtup, buffer);
+
/* Fill in OID and transaction status data for newtup */
if (relation->rd_rel->relhasoids)
{
***************
*** 2690,2695 **** l2:
--- 2734,2749 ----
}
/*
+ * We're about to create the new tuple -- check for conflict first,
+ * to avoid possibly having to roll back work we've just done.
+ *
+ * NOTE: For a tuple insert, we only need to check for table locks, since
+ * predicate locking at the index level will cover ranges for anything
+ * except a table scan. Therefore, only provide the relation.
+ */
+ CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
+
+ /*
* At this point newbuf and buffer are both pinned and locked, and newbuf
* has enough space for the new tuple. If they are the same buffer, only
* one pin is held.
***************
*** 2829,2834 **** l2:
--- 2883,2894 ----
CacheInvalidateHeapTuple(relation, heaptup);
/*
+ * TODO SSI: In order to support SIREAD locks at tuple granularity, any
+ * existing SIREAD locks on the old tuple must be copied to
+ * also refer to the new tuple, somewhere around this point?
+ */
+
+ /*
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
*** a/src/backend/access/index/indexam.c
--- b/src/backend/access/index/indexam.c
***************
*** 64,72 ****
--- 64,74 ----
#include "access/relscan.h"
#include "access/transam.h"
+ #include "access/xact.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
***************
*** 192,197 **** index_insert(Relation indexRelation,
--- 194,204 ----
RELATION_CHECKS;
GET_REL_PROCEDURE(aminsert);
+ if (!(indexRelation->rd_am->ampredlocks))
+ CheckForSerializableConflictIn(indexRelation,
+ (HeapTuple) NULL,
+ InvalidBuffer);
+
/*
* have the am's insert proc do all the work.
*/
***************
*** 266,271 **** index_beginscan_internal(Relation indexRelation,
--- 273,281 ----
RELATION_CHECKS;
GET_REL_PROCEDURE(ambeginscan);
+ if (!(indexRelation->rd_am->ampredlocks))
+ PredicateLockRelation(indexRelation);
+
/*
* We hold a reference count to the relcache entry throughout the scan.
*/
***************
*** 515,520 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
--- 525,531 ----
{
ItemId lp;
ItemPointer ctid;
+ bool valid;
/* check for bogus TID */
if (offnum < FirstOffsetNumber ||
***************
*** 569,576 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
break;
/* If it's visible per the snapshot, we must return it */
! if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
! scan->xs_cbuf))
{
/*
* If the snapshot is MVCC, we know that it could accept at
--- 580,592 ----
break;
/* If it's visible per the snapshot, we must return it */
! valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
! scan->xs_cbuf);
!
! CheckForSerializableConflictOut(valid, scan->heapRelation,
! heapTuple, scan->xs_cbuf);
!
! if (valid)
{
/*
* If the snapshot is MVCC, we know that it could accept at
***************
*** 578,584 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
! if (IsMVCCSnapshot(scan->xs_snapshot))
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
--- 594,601 ----
* any more members. Otherwise, check for continuation of the
* HOT-chain, and set state for next time.
*/
! if (IsMVCCSnapshot(scan->xs_snapshot)
! && !IsXactIsoLevelFullySerializable)
scan->xs_next_hot = InvalidOffsetNumber;
else if (HeapTupleIsHotUpdated(heapTuple))
{
***************
*** 594,599 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
--- 611,618 ----
pgstat_count_heap_fetch(scan->indexRelation);
+ PredicateLockTuple(scan->heapRelation, heapTuple);
+
return heapTuple;
}
*** a/src/backend/access/nbtree/nbtinsert.c
--- b/src/backend/access/nbtree/nbtinsert.c
***************
*** 21,26 ****
--- 21,27 ----
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "utils/inval.h"
#include "utils/tqual.h"
***************
*** 175,180 **** top:
--- 176,189 ----
if (checkUnique != UNIQUE_CHECK_EXISTING)
{
+ /*
+ * The only conflict predicate locking cares about for indexes is when
+ * an index tuple insert conflicts with an existing lock. Since the
+ * actual location of the insert is hard to predict because of the
+ * random search used to prevent O(N^2) performance when there are many
+ * duplicate entries, we can just use the "first valid" page.
+ */
+ CheckForSerializableConflictIn(rel, NULL, buf);
/* do the insertion */
_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
_bt_insertonpg(rel, buf, stack, itup, offset, false);
***************
*** 697,702 **** _bt_insertonpg(Relation rel,
--- 706,714 ----
/* split the buffer into left and right halves */
rbuf = _bt_split(rel, buf, firstright,
newitemoff, itemsz, itup, newitemonleft);
+ PredicateLockPageSplit(rel,
+ BufferGetBlockNumber(buf),
+ BufferGetBlockNumber(rbuf));
/*----------
* By here,
*** a/src/backend/access/nbtree/nbtpage.c
--- b/src/backend/access/nbtree/nbtpage.c
***************
*** 1177,1182 **** _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
--- 1177,1188 ----
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
/*
+ * Any insert which would have gone on the target block will now go to the
+ * right sibling block.
+ */
+ PredicateLockPageCombine(rel, target, rightsib);
+
+ /*
* Next find and write-lock the current parent of the target page. This is
* essentially the same as the corresponding step of splitting.
*/
*** a/src/backend/access/nbtree/nbtsearch.c
--- b/src/backend/access/nbtree/nbtsearch.c
***************
*** 21,26 ****
--- 21,27 ----
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+ #include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
***************
*** 63,69 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
--- 64,73 ----
/* If index is empty and access = BT_READ, no root page is created. */
if (!BufferIsValid(*bufP))
+ {
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
return (BTStack) NULL;
+ }
/* Loop iterates once per level descended in the tree */
for (;;)
***************
*** 88,94 **** _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
--- 92,102 ----
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISLEAF(opaque))
+ {
+ if (access == BT_READ)
+ PredicateLockPage(rel, BufferGetBlockNumber(*bufP));
break;
+ }
/*
* Find the appropriate item on the internal page, and get the child
***************
*** 199,204 **** _bt_moveright(Relation rel,
--- 207,213 ----
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
+ PredicateLockPage(rel, BufferGetBlockNumber(buf));
return buf;
}
***************
*** 1142,1147 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir)
--- 1151,1157 ----
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
{
+ PredicateLockPage(rel, blkno);
/* see if there are any matches on this page */
/* note that this will clear moreRight if we can stop */
if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
***************
*** 1189,1194 **** _bt_steppage(IndexScanDesc scan, ScanDirection dir)
--- 1199,1205 ----
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
{
+ PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf));
/* see if there are any matches on this page */
/* note that this will clear moreLeft if we can stop */
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
***************
*** 1352,1357 **** _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
--- 1363,1369 ----
if (!BufferIsValid(buf))
{
/* empty index... */
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
return InvalidBuffer;
}
***************
*** 1431,1440 **** _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
--- 1443,1454 ----
if (!BufferIsValid(buf))
{
/* empty index... */
+ PredicateLockRelation(rel); /* Nothing finer to lock exists. */
so->currPos.buf = InvalidBuffer;
return false;
}
+ PredicateLockPage(rel, BufferGetBlockNumber(buf));
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
*** a/src/backend/access/transam/xact.c
--- b/src/backend/access/transam/xact.c
***************
*** 39,44 ****
--- 39,45 ----
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
+ #include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
***************
*** 1754,1759 **** CommitTransaction(void)
--- 1755,1767 ----
AtEOXact_LargeObject(true);
/*
+ * Mark serializable transaction as complete for predicate locking
+ * purposes. This should be done as late as we can put it and still
+ * allow errors to be raised for failure patterns found at commit.
+ */
+ PreCommit_CheckForSerializationFailure();
+
+ /*
* Insert notifications sent by NOTIFY commands into the queue. This
* should be late in the pre-commit sequence to minimize time spent
* holding the notify-insertion lock.
*** a/src/backend/catalog/index.c
--- b/src/backend/catalog/index.c
***************
*** 2044,2050 **** IndexCheckExclusion(Relation heapRelation,
*
* After completing validate_index(), we wait until all transactions that
* were alive at the time of the reference snapshot are gone; this is
! * necessary to be sure there are none left with a serializable snapshot
* older than the reference (and hence possibly able to see tuples we did
* not index). Then we mark the index "indisvalid" and commit. Subsequent
* transactions will be able to use it for queries.
--- 2044,2050 ----
*
* After completing validate_index(), we wait until all transactions that
* were alive at the time of the reference snapshot are gone; this is
! * necessary to be sure there are none left with a transaction-based snapshot
* older than the reference (and hence possibly able to see tuples we did
* not index). Then we mark the index "indisvalid" and commit. Subsequent
* transactions will be able to use it for queries.
*** a/src/backend/commands/trigger.c
--- b/src/backend/commands/trigger.c
***************
*** 2360,2366 **** ltrmark:;
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 2360,2366 ----
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/execMain.c
--- b/src/backend/executor/execMain.c
***************
*** 1544,1550 **** EvalPlanQualFetch(EState *estate, Relation relation, int lockmode,
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 1544,1550 ----
case HeapTupleUpdated:
ReleaseBuffer(buffer);
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/nodeBitmapHeapscan.c
--- b/src/backend/executor/nodeBitmapHeapscan.c
***************
*** 42,47 ****
--- 42,48 ----
#include "executor/nodeBitmapHeapscan.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+ #include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
***************
*** 351,357 **** bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
ItemPointerData tid;
ItemPointerSet(&tid, page, offnum);
! if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL))
scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
}
}
--- 352,358 ----
ItemPointerData tid;
ItemPointerSet(&tid, page, offnum);
! if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, NULL))
scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
}
}
*** a/src/backend/executor/nodeIndexscan.c
--- b/src/backend/executor/nodeIndexscan.c
***************
*** 30,35 ****
--- 30,36 ----
#include "executor/execdebug.h"
#include "executor/nodeIndexscan.h"
#include "optimizer/clauses.h"
+ #include "storage/predicate.h"
#include "utils/array.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
*** a/src/backend/executor/nodeLockRows.c
--- b/src/backend/executor/nodeLockRows.c
***************
*** 130,136 **** lnext:
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 130,136 ----
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/nodeModifyTable.c
--- b/src/backend/executor/nodeModifyTable.c
***************
*** 328,334 **** ldelete:;
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 328,334 ----
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
***************
*** 516,522 **** lreplace:;
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelSerializable)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
--- 516,522 ----
break;
case HeapTupleUpdated:
! if (IsXactIsoLevelXactSnapshotBased)
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("could not serialize access due to concurrent update")));
*** a/src/backend/executor/nodeSeqscan.c
--- b/src/backend/executor/nodeSeqscan.c
***************
*** 28,33 ****
--- 28,34 ----
#include "access/relscan.h"
#include "executor/execdebug.h"
#include "executor/nodeSeqscan.h"
+ #include "storage/predicate.h"
static void InitScanRelation(SeqScanState *node, EState *estate);
static TupleTableSlot *SeqNext(SeqScanState *node);
***************
*** 105,115 **** SeqRecheck(SeqScanState *node, TupleTableSlot *slot)
--- 106,118 ----
* tuple.
* We call the ExecScan() routine and pass it the appropriate
* access method functions.
+ * For serializable transactions, we first lock the entire relation.
* ----------------------------------------------------------------
*/
TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
+ PredicateLockRelation(node->ss_currentRelation);
return ExecScan((ScanState *) node,
(ExecScanAccessMtd) SeqNext,
(ExecScanRecheckMtd) SeqRecheck);
*** a/src/backend/executor/nodeTidscan.c
--- b/src/backend/executor/nodeTidscan.c
***************
*** 31,36 ****
--- 31,37 ----
#include "executor/nodeTidscan.h"
#include "optimizer/clauses.h"
#include "storage/bufmgr.h"
+ #include "storage/predicate.h"
#include "utils/array.h"
*** a/src/backend/storage/ipc/ipci.c
--- b/src/backend/storage/ipc/ipci.c
***************
*** 105,110 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
--- 105,111 ----
sizeof(ShmemIndexEnt)));
size = add_size(size, BufferShmemSize());
size = add_size(size, LockShmemSize());
+ size = add_size(size, PredicateLockShmemSize());
size = add_size(size, ProcGlobalShmemSize());
size = add_size(size, XLOGShmemSize());
size = add_size(size, CLOGShmemSize());
***************
*** 200,205 **** CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
--- 201,211 ----
InitLocks();
/*
+ * Set up predicate lock manager
+ */
+ InitPredicateLocks();
+
+ /*
* Set up process table
*/
if (!IsUnderPostmaster)
*** a/src/backend/storage/ipc/shmqueue.c
--- b/src/backend/storage/ipc/shmqueue.c
***************
*** 43,56 **** SHMQueueInit(SHM_QUEUE *queue)
* SHMQueueIsDetached -- TRUE if element is not currently
* in a queue.
*/
- #ifdef NOT_USED
bool
SHMQueueIsDetached(SHM_QUEUE *queue)
{
Assert(ShmemAddrIsValid(queue));
return (queue->prev == NULL);
}
- #endif
/*
* SHMQueueElemInit -- clear an element's links
--- 43,54 ----
*** a/src/backend/storage/lmgr/Makefile
--- b/src/backend/storage/lmgr/Makefile
***************
*** 12,18 **** subdir = src/backend/storage/lmgr
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o
include $(top_srcdir)/src/backend/common.mk
--- 12,18 ----
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
! OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o predicate.o
include $(top_srcdir)/src/backend/common.mk
*** /dev/null
--- b/src/backend/storage/lmgr/predicate.c
***************
*** 0 ****
--- 1,2393 ----
+ /*-------------------------------------------------------------------------
+ *
+ * predicate.c
+ * POSTGRES predicate locking
+ * to support full serializable transaction isolation
+ *
+ * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
+ * locks, which are so different from normal locks that a distinct set of
+ * structures is required to handle them.
+ *
+ * (1) Besides tuples actually read, they must cover ranges of tuples
+ * which would have been read based on the predicate. This will
+ * require modelling the predicates through locks against database
+ * objects such as pages, index ranges, or entire tables.
+ *
+ * (2) They must be kept in RAM for quick access. Because of this, it
+ * isn't possible to always maintain tuple-level granularity -- when
+ * the space allocated to store these approaches exhaustion, a
+ * request for a lock may need to scan for situations where a single
+ * transaction holds many fine-grained locks which can be coalesced
+ * into a single coarser-grained lock.
+ *
+ * (3) They never block anything; they are more like flags than locks
+ * in that regard; although they refer to database objects and are
+ * used to identify rw-conflicts with normal write locks.
+ *
+ * (4) While they are associated with a transaction, they must survive
+ * a successful COMMIT of that transaction, and remain until all
+ * overlapping transactions complete. This even means that they
+ * must survive termination of the transaction's process. On a
+ * rollback of the top level transaction, all of that transaction's
+ * SIREAD locks should be released, however.
+ *
+ * (5) The only transactions which create SIREAD locks or check for
+ * conflicts with them are serializable transactions.
+ *
+ * (6) When a write lock for a top level transaction is found to cover
+ * an existing SIREAD lock for the same transaction, the SIREAD lock
+ * can be deleted.
+ *
+ * (7) A write from a serializable transaction must ensure that a xact
+ * record exists for the transaction, with the same lifespan (until
+ * all concurrent transactions complete or the transaction is rolled
+ * back) so that rw-dependencies to that transaction can be
+ * detected.
+ *
+ *
+ * Lightweight locks to manage access to the predicate locking shared
+ * memory objects must be taken in this order, and should be released in
+ * reverse order:
+ *
+ * SerializableFinishedListLock
+ * - Protects the list of transactions which have completed but which
+ * may yet matter because they overlap still-active transactions.
+ *
+ * SerializablePredicateLockListLock
+ * - Special handling: use shared mode for walking the list *and*
+ * for modifying the list from the process running the owning
+ * transaction. No other process is allowed to walk the list,
+ * and any other process must acquire exclusive access to modify
+ * it. Once a transaction has completed, it is the holder of
+ * the SerializableFinishedListLock who can walk the list in
+ * shared mode.
+ *
+ * FirstPredicateLockMgrLock based partition locks
+ * - The same lock protects a target and all locks on that target.
+ * - When more than one is needed, acquire in ascending order.
+ *
+ * SerializableXactHashLock
+ * - Protects both SerializableXactHash and SerializableXidHash.
+ *
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+ /*
+ * INTERFACE ROUTINES
+ *
+ * housekeeping for setting up shared memory predicate lock structures
+ * InitPredicateLocks(void)
+ * PredicateLockShmemSize(void)
+ *
+ * predicate lock reporting
+ * PredicateLockData *GetPredicateLockStatusData(void)
+ *
+ * predicate lock maintenance
+ * RegisterSerializableTransaction(Snapshot snapshot)
+ * PredicateLockRelation(Relation relation)
+ * PredicateLockPage(Relation relation, BlockNumber blkno)
+ * PredicateLockTuple(Relation relation, HeapTuple tuple)
+ * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno);
+ * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno);
+ * ReleasePredicateLocks(bool isCommit)
+ *
+ * conflict detection (may also trigger rollback)
+ * CheckForSerializableConflictOut(bool valid, Relation relation,
+ * HeapTupleData *tup, Buffer buffer)
+ * CheckForSerializableConflictIn(Relation relation, HeapTupleData *tup,
+ * Buffer buffer)
+ *
+ * final rollback checking
+ * PreCommit_CheckForSerializationFailure(void)
+ */
+
+ #include "postgres.h"
+
+ #include "access/transam.h"
+ #include "access/twophase.h"
+ #include "access/xact.h"
+ #include "miscadmin.h"
+ #include "storage/bufmgr.h"
+ #include "storage/predicate.h"
+ #include "utils/rel.h"
+ #include "utils/snapmgr.h"
+
+ /*
+ * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
+ * transaction or any of its subtransactions.
+ */
+ typedef struct SERIALIZABLEXIDTAG
+ {
+ TransactionId xid;
+ } SERIALIZABLEXIDTAG;
+
+ /*
+ * Information linking an assigned xid back to its top-level serializable transaction.
+ */
+ typedef struct SERIALIZABLEXID
+ {
+ /* hash key */
+ SERIALIZABLEXIDTAG tag;
+
+ /* data */
+ SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
+ SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
+ * xids */
+ } SERIALIZABLEXID;
+
+ /*
+ * Per-locked-object predicate lock information:
+ *
+ * tag -- uniquely identifies the object being locked
+ * predicateLocks -- list of predicate lock objects for this target.
+ */
+ typedef struct PREDICATELOCKTARGET
+ {
+ /* hash key */
+ PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
+ * predicate lock target */
+ } PREDICATELOCKTARGET;
+
+ typedef struct PREDICATELOCKTAG
+ {
+ PREDICATELOCKTARGET *myTarget;
+ SERIALIZABLEXACT *myXact;
+ } PREDICATELOCKTAG;
+
+ typedef struct PREDICATELOCK
+ {
+ /* hash key */
+ PREDICATELOCKTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
+ * predicate locks */
+ SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
+ * predicate locks */
+ } PREDICATELOCK;
+
+ /*
+ * Backend-local hash table of ancestor (coarser) locks and the number
+ * of (finer-grained) children locks that are currently held. This is
+ * used to determine when to promote multiple fine-grained locks to
+ * one coarse-grained lock.
+ */
+ typedef struct LOCALPREDICATELOCK
+ {
+ /* hash key */
+ PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ bool held; /* is lock held, or just its children? */
+ int childLocks; /* number of child locks currently held */
+ } LOCALPREDICATELOCK;
+ static HTAB *LocalPredicateLockHash = NULL;
+
+
+ /*
+ * Test the most selective fields first, for performance.
+ *
+ * a is covered by b if all of the following hold:
+ * 1) a.database = b.database
+ * 2) a.relation = b.relation
+ * 3) b.offset is invalid (b is page-granularity or higher)
+ * 4) either of the following:
+ * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
+ * or 4b) a.offset is invalid and b.page is invalid (a is
+ * page-granularity and b is relation-granularity)
+ */
+ #define TargetTagIsCoveredBy(covered_target, covering_target) \
+ ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \
+ GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \
+ && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \
+ InvalidOffsetNumber) /* (3) */ \
+ && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \
+ InvalidOffsetNumber) /* (4a) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \
+ || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ InvalidBlockNumber) /* (4b) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \
+ != InvalidBlockNumber))) \
+ && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \
+ GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
+
+ /*
+ * The predicate locking target and lock shared hash tables are partitioned to
+ * reduce contention. To determine which partition a given target belongs to,
+ * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
+ * apply one of these macros.
+ * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
+ */
+ #define PredicateLockHashPartition(hashcode) \
+ ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
+ #define PredicateLockHashPartitionLock(hashcode) \
+ ((LWLockId) (FirstPredicateLockMgrLock + PredicateLockHashPartition(hashcode)))
+
+ #define NPREDICATELOCKTARGETENTS() \
+ mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+ #define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink)))
+
+ #define SxactIsCommitted(sxact) TransactionIdIsValid((sxact)->finishedBefore)
+ #define SxactCommittedBefore(sxactPivotOut, sxactOther) \
+ ((!TransactionIdIsValid((sxactOther)->finishedBefore)) \
+ || TransactionIdPrecedesOrEquals((sxactPivotOut)->finishedBefore, (sxactOther)->finishedBefore))
+
+ /*
+ * When a public interface method is called for a split on an index relation,
+ * this is the test to see if we should do a quick return.
+ */
+ #define SkipSplitTracking(relation) \
+ (((relation)->rd_id < FirstBootstrapObjectId) \
+ || ((relation)->rd_istemp))
+
+ /*
+ * When a public interface method is called for serializing a relation within
+ * the current transaction, this is the test to see if we should do a quick return.
+ */
+ #define SkipSerialization(relation) \
+ ((!IsXactIsoLevelFullySerializable) \
+ || SkipSplitTracking(relation))
+
+
+ /*
+ * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+ #define PredicateLockTargetTagHashCode(predicatelocktargettag) \
+ (tag_hash((predicatelocktargettag), sizeof(PREDICATELOCKTARGETTAG)))
+
+ /*
+ * Given a predicate lock tag, and the hash for its target,
+ * compute the lock hash.
+ *
+ * To make the hash code also depend on the transaction, we xor the sxid
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+ #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
+ ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+
+ /* This configuration variable is used to set the predicate lock table size */
+ int max_predicate_locks_per_xact; /* set by guc.c */
+ 
+ /*
+ * These global variables are maintained when registering and cleaning up
+ * serializable transactions. They must be global across all backends, but
+ * are not needed outside this source file, so no .h declaration is needed.
+ *
+ * NOTE(review): as written these are ordinary process-local variables; after
+ * fork() each backend gets its own copy, so an update made in one backend is
+ * not visible to the others.  If they really must be consistent across all
+ * backends they belong in shared memory, protected by a suitable lock --
+ * confirm.  Also, if they are file-local they should be declared static.
+ */
+ TransactionId SerializableGlobalXmin = InvalidTransactionId;
+ int SerializableGlobalXminCount = 0;
+ 
+ /*
+ * The predicate locking hash tables are in shared memory.
+ * Each backend keeps pointers to them.
+ */
+ static HTAB *SerializableXactHash;
+ static HTAB *SerializableXidHash;
+ static HTAB *PredicateLockTargetHash;
+ static HTAB *PredicateLockHash;
+ static SHM_QUEUE *FinishedSerializableTransactions;
+ 
+ /*
+ * Keep a pointer to the currently-running serializable transaction (if any)
+ * for quick reference.
+ */
+ typedef SERIALIZABLEXACT *SERIALIZABLEXACTPtr;
+ 
+ #define InvalidSerializableXact ((SERIALIZABLEXACTPtr) NULL)
+ static volatile SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
+ 
+ /* TODO SSI: Remove volatile qualifier and the then-unnecessary casts? */
+ 
+ /* The most recently used xid within this transaction, for optimizations. */
+ static TransactionId MyXid = InvalidTransactionId;
+ 
+ 
+ /* local functions */
+ static uint32 predicatelock_hash(const void *key, Size keysize);
+ static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact);
+ static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *newtargettag);
+ static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
+ static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *targettag);
+ static int PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag);
+ static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent);
+ static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
+ static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *tag);
+ static void EnsureMySerializableXidExists(void);
+ static void ClearOldPredicateLocks(void);
+ static bool XidIsConcurrent(TransactionId xid);
+ static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+ static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
+ static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
+
+ /*
+ * InitPredicateLocks -- Initialize the predicate locking data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here. Backends inherit the pointers
+ * to the shared tables via fork(). In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables.
+ */
+ void
+ InitPredicateLocks(void)
+ {
+ HASHCTL info;
+ int hash_flags;
+ long init_table_size,
+ max_table_size;
+ bool found;
+ 
+ /*
+ * Compute init/max size to request for predicate lock target hashtable.
+ * Note these calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+ init_table_size = max_table_size / 2;
+ 
+ /*
+ * Allocate hash table for PREDICATELOCKTARGET structs. This stores
+ * per-predicate-lock-target information.
+ */
+ MemSet(&info, 0, sizeof(info));
+ info.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ info.entrysize = sizeof(PREDICATELOCKTARGET);
+ info.hash = tag_hash;
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+ hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+ 
+ PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ hash_flags);
+ 
+ /* Assume an average of 2 xacts per target */
+ max_table_size *= 2;
+ init_table_size *= 2;
+ 
+ /*
+ * Allocate hash table for PREDICATELOCK structs. This stores per
+ * xact-lock-of-a-target information.
+ *
+ * Both lock tables are partitioned identically so that a lock and its
+ * target fall under the same partition LWLock (see predicatelock_hash).
+ */
+ MemSet(&info, 0, sizeof(info));
+ info.keysize = sizeof(PREDICATELOCKTAG);
+ info.entrysize = sizeof(PREDICATELOCK);
+ info.hash = predicatelock_hash;
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+ hash_flags = (HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+ 
+ PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ hash_flags);
+ 
+ /*
+ * Compute init/max size to request for serializable transaction
+ * hashtable. Note these calculations must agree with
+ * PredicateLockShmemSize!
+ */
+ max_table_size = MaxBackends;
+ init_table_size = max_table_size / 2;
+ 
+ /*
+ * Allocate hash table for SERIALIZABLEXACT structs. This stores per-vxid
+ * information for serializable transactions which have accessed data.
+ */
+ MemSet(&info, 0, sizeof(info));
+ info.keysize = sizeof(SERIALIZABLEXACTTAG);
+ info.entrysize = sizeof(SERIALIZABLEXACT);
+ info.hash = tag_hash;
+ hash_flags = (HASH_ELEM | HASH_FUNCTION);
+ 
+ SerializableXactHash = ShmemInitHash("SERIALIZABLEXACT hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ hash_flags);
+ 
+ /* Assume an average of 10 serializable xids per backend. */
+ max_table_size *= 10;
+ init_table_size *= 10;
+ 
+ /*
+ * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid
+ * information for serializable transactions which have accessed data.
+ */
+ MemSet(&info, 0, sizeof(info));
+ info.keysize = sizeof(SERIALIZABLEXIDTAG);
+ info.entrysize = sizeof(SERIALIZABLEXID);
+ info.hash = tag_hash;
+ hash_flags = (HASH_ELEM | HASH_FUNCTION);
+ 
+ SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ hash_flags);
+ 
+ /*
+ * Create or attach to the header for the list of finished serializable
+ * transactions.
+ */
+ FinishedSerializableTransactions = (SHM_QUEUE *)
+ ShmemInitStruct("FinishedSerializableTransactions",
+ sizeof(SHM_QUEUE),
+ &found);
+ /* Only the first process to attach initializes the queue header. */
+ if (!found)
+ SHMQueueInit(FinishedSerializableTransactions);
+ }
+
+ /*
+ * Estimate shared-memory space used for predicate lock table
+ *
+ * The sizes computed here must stay in sync with the table sizes requested
+ * in InitPredicateLocks().
+ */
+ Size
+ PredicateLockShmemSize(void)
+ {
+ Size size = 0;
+ long max_table_size;
+ 
+ /* predicate lock target hash table */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(PREDICATELOCKTARGET)));
+ 
+ /* predicate lock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(PREDICATELOCK)));
+ 
+ /*
+ * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
+ * margin.  Note the margin applies only to the two lock tables above;
+ * the tables below are added afterward and sized exactly.
+ */
+ size = add_size(size, size / 10);
+ 
+ /* serializable transaction table */
+ max_table_size = MaxBackends;
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(SERIALIZABLEXACT)));
+ 
+ /* serializable subtransaction table */
+ max_table_size *= 10;
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(SERIALIZABLEXID)));
+ 
+ /* Head for list of serializable transactions. */
+ size = add_size(size, sizeof(SHM_QUEUE));
+ 
+ return size;
+ }
+
+
+ /*
+  * Hash function for PREDICATELOCKTAG entries.
+  *
+  * The PREDICATELOCK and PREDICATELOCKTARGET hash tables share one set of
+  * partition LWLocks, so a lock must land in the same partition as its
+  * target.  dynahash.c takes the partition number from the low-order bits
+  * of the hash code, so we derive the lock's hash from the target tag's
+  * hash and mix the transaction pointer into the high-order bits only
+  * (see PredicateLockHashCodeFromTargetHashCode).
+  */
+ static uint32
+ predicatelock_hash(const void *key, Size keysize)
+ {
+ 	const PREDICATELOCKTAG *locktag = (const PREDICATELOCKTAG *) key;
+ 
+ 	Assert(keysize == sizeof(PREDICATELOCKTAG));
+ 
+ 	/* Hash the target tag, then fold in the owning transaction. */
+ 	return PredicateLockHashCodeFromTargetHashCode(locktag,
+ 			PredicateLockTargetTagHashCode(&locktag->myTarget->tag));
+ }
+
+
+ /*
+ * GetPredicateLockStatusData
+ * Return a table containing the internal state of the predicate
+ * lock manager for use in pg_lock_status.
+ *
+ * Like GetLockStatusData, this function tries to hold the partition LWLocks
+ * for as short a time as possible by returning two arrays that simply
+ * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
+ * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
+ * SERIALIZABLEXACT will likely appear.
+ */
+ PredicateLockData *
+ GetPredicateLockStatusData(void)
+ {
+ PredicateLockData *data;
+ int i;
+ int els,
+ el;
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCK *predlock;
+ 
+ data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
+ 
+ /*
+ * Acquire locks. To ensure consistency, take simultaneous locks on
+ * SerializableFinishedListLock, all partition locks in ascending order,
+ * then SerializableXactHashLock. TODO SSI: Do we really need to lock
+ * SerializableFinishedListLock?
+ */
+ LWLockAcquire(SerializableFinishedListLock, LW_SHARED);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(FirstPredicateLockMgrLock + i, LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ 
+ /*
+ * Get number of locks and allocate appropriately-sized arrays.  The
+ * count is stable because we hold all the partition locks for the
+ * duration of the scan below.
+ */
+ els = hash_get_num_entries(PredicateLockHash);
+ data->nelements = els;
+ data->locktags = (PREDICATELOCKTARGETTAG *)
+ palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
+ data->xacts = (SERIALIZABLEXACT *)
+ palloc(sizeof(SERIALIZABLEXACT) * els);
+ 
+ 
+ /* Scan through PredicateLockHash and copy contents */
+ hash_seq_init(&seqstat, PredicateLockHash);
+ 
+ el = 0;
+ 
+ while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
+ {
+ /* Copy both the target tag and the full owning sxact by value. */
+ data->locktags[el] = predlock->tag.myTarget->tag;
+ data->xacts[el] = *predlock->tag.myXact;
+ el++;
+ }
+ 
+ Assert(el == els);
+ 
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(FirstPredicateLockMgrLock + i);
+ LWLockRelease(SerializableFinishedListLock);
+ 
+ return data;
+ }
+
+
+ /*
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
+ * It should be current for this process and be contained in
+ * SerializableXactHash.
+ *
+ * The snapshot's xmin is recorded in the new sxact and used to maintain
+ * SerializableGlobalXmin / SerializableGlobalXminCount, which track the
+ * oldest xmin among active serializable transactions.
+ */
+ void
+ RegisterSerializableTransaction(const Snapshot snapshot)
+ {
+ PGPROC *proc;
+ SERIALIZABLEXACTTAG sxacttag;
+ SERIALIZABLEXACT *sxact;
+ bool found;
+ HASHCTL hash_ctl;
+ 
+ /* We only do this for serializable transactions. Once. */
+ Assert(IsXactIsoLevelFullySerializable);
+ Assert(MySerializableXact == InvalidSerializableXact);
+ 
+ proc = MyProc;
+ Assert(proc != NULL);
+ GET_VXID_FROM_PGPROC(sxacttag.vxid, *proc);
+ 
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ if (!TransactionIdIsValid(SerializableGlobalXmin))
+ {
+ /* First active serializable xact: establish the global xmin. */
+ Assert(SerializableGlobalXminCount == 0);
+ SerializableGlobalXmin = snapshot->xmin;
+ SerializableGlobalXminCount = 1;
+ }
+ else if (SerializableGlobalXmin == snapshot->xmin)
+ {
+ /* Another xact sharing the oldest xmin: bump the refcount. */
+ Assert(SerializableGlobalXminCount > 0);
+ SerializableGlobalXminCount++;
+ }
+ else
+ {
+ /* A newer xmin doesn't affect the tracked (oldest) global xmin. */
+ Assert(TransactionIdFollows(snapshot->xmin, SerializableGlobalXmin));
+ }
+ sxact = (SERIALIZABLEXACT *) hash_search(SerializableXactHash,
+ &sxacttag,
+ HASH_ENTER, &found);
+ Assert(!found);
+ if (!sxact)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ 
+ /* Initialize the structure. */
+ sxact->outConflict = InvalidSerializableXact;
+ sxact->inConflict = InvalidSerializableXact;
+ sxact->topXid = GetTopTransactionIdIfAny();
+ sxact->finishedBefore = InvalidTransactionId;
+ sxact->xmin = snapshot->xmin;
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueInit(&(sxact->xids));
+ SHMQueueElemInit(&(sxact->finishedLink));
+ sxact->rolledBack = false;
+ LWLockRelease(SerializableXactHashLock);
+ 
+ MySerializableXact = sxact;
+ 
+ /* Initialize the backend-local hash table of parent locks */
+ Assert(LocalPredicateLockHash == NULL);
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ hash_ctl.hash = tag_hash;
+ LocalPredicateLockHash = hash_create("Local predicate lock",
+ max_predicate_locks_per_xact,
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION);
+ }
+
+ /*
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
+ * It should be current for this process and be contained in SerializableXidHash.
+ */
+ static void
+ EnsureMySerializableXidExists(void)
+ {
+ TransactionId xid;
+ 
+ Assert(MySerializableXact != InvalidSerializableXact);
+ 
+ /*
+ * NOTE(review): topXid is refreshed here without holding
+ * SerializableXactHashLock -- confirm that concurrent readers of the
+ * sxact tolerate an unlocked update of this field.
+ */
+ MySerializableXact->topXid = GetTopTransactionIdIfAny();
+ 
+ /*
+ * If this isn't the xid we've most recently seen for this vxid, make sure
+ * it's in the hash table.
+ */
+ xid = GetCurrentTransactionIdIfAny();
+ if (MyXid != xid)
+ {
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ bool found;
+ 
+ Assert(TransactionIdIsValid(xid));
+ 
+ sxidtag.xid = xid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ if (!sxid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ 
+ /* Initialize the structure. */
+ if (!found)
+ {
+ /* Link the new xid entry into this sxact's list of xids. */
+ sxid->myXact = (SERIALIZABLEXACT *) MySerializableXact;
+ SHMQueueInsertBefore(&(((SERIALIZABLEXACT *) MySerializableXact)->xids),
+ &(sxid->xactLink));
+ }
+ LWLockRelease(SerializableXactHashLock);
+ /* Cache the xid so repeat calls within the same xact are cheap. */
+ MyXid = xid;
+ }
+ }
+
+
+ /*
+  * Check whether this transaction currently holds a predicate lock on
+  * the given target.
+  *
+  * Only the backend-local lock table is consulted.  An entry may exist
+  * purely as parent-count bookkeeping for a finer-grained lock, so the
+  * "held" flag is what decides the answer.
+  */
+ static bool
+ PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
+ {
+ 	LOCALPREDICATELOCK *entry;
+ 
+ 	entry = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ 											   targettag,
+ 											   HASH_FIND, NULL);
+ 
+ 	return entry != NULL && entry->held;
+ }
+
+ /*
+  * Compute the next coarser lock tag covering the given tag.
+  *
+  * On success stores the parent tag into *parent and returns true;
+  * returns false when the tag is already at the coarsest (relation)
+  * granularity and so has no parent.
+  */
+ static bool
+ GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ 						  PREDICATELOCKTARGETTAG *parent)
+ {
+ 	switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ 	{
+ 		case PREDLOCKTAG_TUPLE:
+ 			/* a tuple is covered by its page */
+ 			SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
+ 											GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ 											GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
+ 											GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
+ 			return true;
+ 
+ 		case PREDLOCKTAG_PAGE:
+ 			/* a page is covered by its relation */
+ 			SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
+ 												GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ 												GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
+ 			return true;
+ 
+ 		case PREDLOCKTAG_RELATION:
+ 			/* nothing is coarser than a relation lock */
+ 			return false;
+ 	}
+ 
+ 	Assert(false);				/* unreachable */
+ 	return false;
+ }
+
+ /*
+  * Determine whether some coarser-granularity lock already held by this
+  * transaction covers the lock we are considering.
+  */
+ static bool
+ CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
+ {
+ 	PREDICATELOCKTARGETTAG current,
+ 				parent;
+ 
+ 	/* Walk up the granularity hierarchy, testing each ancestor. */
+ 	for (current = *newtargettag;
+ 		 GetParentPredicateLockTag(&current, &parent);
+ 		 /* advance happens in the body */ )
+ 	{
+ 		current = parent;
+ 		if (PredicateLockExists(&current))
+ 			return true;
+ 	}
+ 
+ 	/* reached the top of the hierarchy without finding a covering lock */
+ 	return false;
+ }
+
+
+ /*
+ * Delete child target locks owned by this process.
+ * This implementation is assuming that the usage of each target tag field
+ * is uniform. No need to make this hard if we don't have to.
+ *
+ * We aren't acquiring lightweight locks for the predicate lock or lock
+ * target structures associated with this transaction unless we're going
+ * to modify them, because no other process is permitted to modify our
+ * locks.
+ */
+ static void
+ DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
+ {
+ SERIALIZABLEXACT *sxact;
+ PREDICATELOCK *predlock;
+ 
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ sxact = (SERIALIZABLEXACT *) MySerializableXact;
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ SHM_QUEUE *predlocksxactlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG oldlocktag;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGETTAG oldtargettag;
+ 
+ /* Save the next link first; this entry may be deleted below. */
+ predlocksxactlink = &(predlock->xactLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ predlocksxactlink,
+ offsetof(PREDICATELOCK, xactLink));
+ 
+ oldlocktag = predlock->tag;
+ Assert(oldlocktag.myXact == sxact);
+ oldtarget = oldlocktag.myTarget;
+ oldtargettag = oldtarget->tag;
+ 
+ if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
+ {
+ uint32 oldtargettaghash;
+ LWLockId partitionLock;
+ PREDICATELOCK *rmpredlock;
+ PREDICATELOCKTARGET *rmtarget;
+ 
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ 
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ 
+ /* Unlink from both the xact's and the target's lock lists. */
+ SHMQueueDelete(predlocksxactlink);
+ SHMQueueDelete(&(predlock->targetLink));
+ rmpredlock = hash_search_with_hash_value(PredicateLockHash,
+ &oldlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&oldlocktag, oldtargettaghash),
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == predlock);
+ 
+ /* Drop the target itself when its last lock is gone. */
+ if (SHMQueueEmpty(&oldtarget->predicateLocks))
+ {
+ rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmtarget == oldtarget);
+ }
+ 
+ LWLockRelease(partitionLock);
+ 
+ /* Keep the local parent-count bookkeeping consistent. */
+ DecrementParentLocks(&oldtargettag);
+ }
+ 
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializablePredicateLockListLock);
+ }
+
+ /*
+  * Returns the promotion threshold for a given predicate lock
+  * target: the number of descendant locks (direct or not -- tuples and
+  * pages both count toward a relation) that triggers promotion to the
+  * specified tag's granularity.
+  *
+  * TODO SSI: We should do something more intelligent about what the
+  * thresholds are, either making it proportional to the number of
+  * tuples in a page & pages in a relation, or at least making it a
+  * GUC. Currently the threshold is 3 for a page lock, and
+  * max_predicate_locks_per_transaction/2 for a relation lock, chosen
+  * entirely arbitrarily (and without benchmarking).
+  */
+ static int
+ PredicateLockPromotionThreshold(const PREDICATELOCKTARGETTAG *tag)
+ {
+ 	switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ 	{
+ 		case PREDLOCKTAG_TUPLE:
+ 			/*
+ 			 * Tuples are the finest granularity; nothing can ever be
+ 			 * promoted *to* a tuple lock, so this must not be reached.
+ 			 */
+ 			Assert(false);
+ 			return 0;
+ 
+ 		case PREDLOCKTAG_PAGE:
+ 			return 3;
+ 
+ 		case PREDLOCKTAG_RELATION:
+ 			return max_predicate_locks_per_xact / 2;
+ 	}
+ 
+ 	Assert(false);				/* unreachable */
+ 	return 0;
+ }
+
+ /*
+ * For all ancestors of a newly-acquired predicate lock, increment
+ * their child count in the parent hash table. If any of them have
+ * more descendants than their promotion threshold, acquire the
+ * coarsest such lock.
+ *
+ * Returns true if a parent lock was acquired and false otherwise.
+ */
+ static bool
+ CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
+ {
+ PREDICATELOCKTARGETTAG targettag,
+ nexttag,
+ promotiontag;
+ LOCALPREDICATELOCK *parentlock;
+ bool found,
+ promote;
+ 
+ promote = false;
+ 
+ targettag = *reqtag;
+ 
+ /* check parents iteratively */
+ while (GetParentPredicateLockTag(&targettag, &nexttag))
+ {
+ targettag = nexttag;
+ parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ &targettag,
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ /* New bookkeeping entry: this is its first child lock. */
+ parentlock->held = false;
+ parentlock->childLocks = 1;
+ }
+ else
+ parentlock->childLocks++;
+ 
+ if (parentlock->childLocks >=
+ PredicateLockPromotionThreshold(&targettag))
+ {
+ /*
+ * We should promote to this parent lock. Continue to check its
+ * ancestors, however, both to get their child counts right and to
+ * check whether we should just go ahead and promote to one of
+ * them.  promotiontag ends up holding the coarsest qualifying
+ * ancestor.
+ */
+ promotiontag = targettag;
+ promote = true;
+ }
+ }
+ 
+ if (promote)
+ {
+ /* acquire coarsest ancestor eligible for promotion */
+ PredicateLockAcquire(&promotiontag);
+ return true;
+ }
+ else
+ return false;
+ }
+
+ /*
+ * When releasing a lock, decrement the child count on all ancestor
+ * locks.
+ *
+ * This is called only when releasing a lock via
+ * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
+ * we've acquired its parent, possibly due to promotion) or when a new
+ * MVCC write lock makes the predicate lock unnecessary. There's no
+ * point in calling it when locks are released at transaction end, as
+ * this information is no longer needed.
+ */
+ static void
+ DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
+ {
+ PREDICATELOCKTARGETTAG parenttag,
+ nexttag;
+ 
+ parenttag = *targettag;
+ 
+ while (GetParentPredicateLockTag(&parenttag, &nexttag))
+ {
+ uint32 targettaghash;
+ LOCALPREDICATELOCK *parentlock,
+ *rmlock;
+ 
+ parenttag = nexttag;
+ targettaghash = PredicateLockTargetTagHashCode(&parenttag);
+ parentlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_FIND, NULL);
+ /* An ancestor entry must exist for every held descendant lock. */
+ Assert(parentlock != NULL);
+ parentlock->childLocks--;
+ 
+ Assert(parentlock->childLocks >= 0);
+ 
+ /* Discard entries that are no longer held and cover no children. */
+ if ((parentlock->childLocks == 0) && (!parentlock->held))
+ {
+ rmlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlock == parentlock);
+ }
+ }
+ }
+
+ /*
+ * Acquire a predicate lock on the specified target for the current
+ * connection if not already held. Create related serializable transaction
+ * and predicate lock target entries first if missing.
+ */
+ static void
+ PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
+ {
+ uint32 targettaghash;
+ LWLockId partitionLock;
+ bool found;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTAG locktag;
+ PREDICATELOCK *lock;
+ LOCALPREDICATELOCK *locallock;
+ 
+ EnsureMySerializableXidExists();
+ 
+ /* Do we have the lock already, or a covering lock? */
+ if (PredicateLockExists(targettag))
+ return;
+ 
+ if (CoarserLockCovers(targettag))
+ return;
+ 
+ /* the same hash and LW lock apply to the lock target and the local lock. */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ 
+ /*
+ * Acquire lock in local table.
+ *
+ * NOTE(review): the local entry is marked held before the shared-memory
+ * entries are created; if one of the HASH_ENTER calls below raises
+ * ERROR, the transaction aborts, which presumably discards the local
+ * table as well -- confirm cleanup covers this window.
+ */
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_ENTER, &found);
+ /* We should not hold the lock (but its entry might still exist) */
+ Assert(!found || !locallock->held);
+ locallock->held = true;
+ if (!found)
+ locallock->childLocks = 0;
+ 
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ 
+ /* Make sure that the target is represented. */
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_ENTER, &found);
+ if (!target)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ if (!found)
+ SHMQueueInit(&(target->predicateLocks));
+ 
+ /* We've got the sxact and target, make sure they're joined. */
+ locktag.myTarget = target;
+ locktag.myXact = (SERIALIZABLEXACT *) MySerializableXact;
+ lock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash, &locktag,
+ PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
+ HASH_ENTER, &found);
+ if (!lock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ 
+ if (!found)
+ {
+ /* New lock: link it into both the target's and the xact's lists. */
+ SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink));
+ SHMQueueInsertBefore((SHM_QUEUE *) &(MySerializableXact->predicateLocks), &(lock->xactLink));
+ }
+ 
+ LWLockRelease(partitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+ 
+ /*
+ * Lock has been acquired. Check whether it should be promoted to a
+ * coarser granularity, or whether there are finer-granularity locks to
+ * clean up.
+ */
+ if (CheckAndPromotePredicateLockRequest(targettag))
+ {
+ /*
+ * Lock request was promoted to a coarser-granularity lock, and that
+ * lock was acquired. It will delete this lock and any of its
+ * children, so we're done.
+ */
+ }
+ else
+ {
+ /* Clean up any finer-granularity locks */
+ if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
+ DeleteChildTargetLocks(targettag);
+ }
+ }
+
+
+ /*
+  * PredicateLockRelation
+  *
+  * Acquires a relation-granularity predicate lock, which also clears any
+  * finer-grained predicate locks this session holds on the relation.
+  * Returns immediately when the transaction is not fully serializable or
+  * the relation is exempt from predicate locking (temporary, or an OID
+  * below FirstBootstrapObjectId).
+  */
+ void
+ PredicateLockRelation(const Relation relation)
+ {
+ 	PREDICATELOCKTARGETTAG targettag;
+ 
+ 	if (SkipSerialization(relation))
+ 		return;
+ 
+ 	SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
+ 										relation->rd_node.dbNode,
+ 										relation->rd_id);
+ 	PredicateLockAcquire(&targettag);
+ }
+
+ /*
+  * PredicateLockPage
+  *
+  * Acquires a page-granularity predicate lock, which also clears any
+  * finer-grained predicate locks this session holds under it.  Returns
+  * immediately when the transaction is not fully serializable, when the
+  * relation is exempt from predicate locking (temporary, or an OID below
+  * FirstBootstrapObjectId), or when a coarser lock already covers the page
+  * (detected inside PredicateLockAcquire).
+  */
+ void
+ PredicateLockPage(const Relation relation, const BlockNumber blkno)
+ {
+ 	PREDICATELOCKTARGETTAG targettag;
+ 
+ 	if (SkipSerialization(relation))
+ 		return;
+ 
+ 	SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ 									relation->rd_node.dbNode,
+ 									relation->rd_id,
+ 									blkno);
+ 	PredicateLockAcquire(&targettag);
+ }
+
+ /*
+ * PredicateLockTuple
+ *
+ * Gets a predicate lock at the tuple level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ */
+ void
+ PredicateLockTuple(const Relation relation, const HeapTuple tuple)
+ {
+ PREDICATELOCKTARGETTAG tag;
+ ItemPointer tid;
+ 
+ if (SkipSerialization(relation))
+ return;
+ 
+ /*
+ * If it's a heap tuple, return if this xact wrote it. It might be useful
+ * to pass in the xmin from the tuple as another parameter.
+ */
+ if (relation->rd_index == NULL)
+ {
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ 
+ /* Look up the writer of this tuple version by its xmin. */
+ sxidtag.xid = HeapTupleHeaderGetXmin(tuple->t_data);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (sxid)
+ {
+ if (sxid->myXact == MySerializableXact)
+ {
+ /* We wrote it; we already have a write lock. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ }
+ /* Lock must be released on the fall-through path as well. */
+ LWLockRelease(SerializableXactHashLock);
+ }
+ 
+ tid = &(tuple->t_self);
+ SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ PredicateLockAcquire(&tag);
+ }
+
+ /*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock which is in shared memory. This could cause
+ * problems if enough page splits occur on locked pages without the processes
+ * which hold the locks getting in and noticing.
+ */
+ void
+ PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno)
+ {
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ uint32 oldtargettaghash;
+ LWLockId oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLockId newpartitionLock;
+ 
+ if (SkipSplitTracking(relation))
+ return;
+ 
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+ 
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ newblkno);
+ 
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+ 
+ LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
+ 
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock, LW_SHARED);
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock, LW_SHARED);
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ 
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and create new ones for the new block
+ * number.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ bool found;
+ PREDICATELOCK *oldpredlock;
+ PREDICATELOCKTAG newpredlocktag;
+ 
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_ENTER, &found);
+ /* A newly-split page cannot already have predicate lock targets. */
+ Assert(!found);
+ if (!newtarget)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ SHMQueueInit(&(newtarget->predicateLocks));
+ 
+ newpredlocktag.myTarget = newtarget;
+ 
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (oldpredlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ 
+ predlocktargetlink = &(oldpredlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ /* Create a matching lock on the new page for the same xact. */
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+ 
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, newtargettaghash),
+ HASH_ENTER, &found);
+ if (!newpredlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ Assert(!found);
+ SHMQueueInsertBefore(&(newtarget->predicateLocks), &(newpredlock->targetLink));
+ SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks), &(newpredlock->xactLink));
+ 
+ oldpredlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ }
+ 
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+ }
+
+ /*
+ * PredicateLockPageCombine
+ *
+ * Combines predicate locks for two existing pages: every predicate lock
+ * on oldblkno is transferred to newblkno, and the old target is removed.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page combine affects all serializable
+ * transactions, even if it occurs in the context of another
+ * transaction isolation level.
+ */
+ void
+ PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno)
+ {
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ uint32 oldtargettaghash;
+ LWLockId oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLockId newpartitionLock;
+ 
+ if (SkipSplitTracking(relation))
+ return;
+ 
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+ 
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ newblkno);
+ 
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+ 
+ LWLockAcquire(SerializablePredicateLockListLock, LW_EXCLUSIVE);
+ 
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock, LW_EXCLUSIVE);
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ 
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and create new ones for the new block
+ * number, while deleting the old ones.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ PREDICATELOCK *oldpredlock;
+ PREDICATELOCKTAG newpredlocktag;
+ bool found;
+ 
+ /*
+ * Unlike a page split, which creates the new target up front, the
+ * combine target may not exist yet, so enter it here rather than
+ * merely asserting its existence.
+ */
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_ENTER, &found);
+ if (!newtarget)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ if (!found)
+ SHMQueueInit(&(newtarget->predicateLocks));
+ 
+ newpredlocktag.myTarget = newtarget;
+ 
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (oldpredlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ 
+ predlocktargetlink = &(oldpredlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+ 
+ /*
+ * Unlink the old lock from both its target's list and its
+ * transaction's list before removing its hash entry; HASH_REMOVE
+ * recycles the entry, so leaving it linked would corrupt those
+ * shared memory queues with dangling pointers.
+ */
+ SHMQueueDelete(&(oldpredlock->targetLink));
+ SHMQueueDelete(&(oldpredlock->xactLink));
+ 
+ hash_search_with_hash_value(PredicateLockHash,
+ &oldpredlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag, oldtargettaghash),
+ HASH_REMOVE, NULL);
+ 
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, newtargettaghash),
+ HASH_ENTER, &found);
+ if (!newpredlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_predicate_locks_per_transaction.")));
+ /* If this transaction already locked the new page, nothing to add. */
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(newtarget->predicateLocks), &(newpredlock->targetLink));
+ SHMQueueInsertBefore((SHM_QUEUE *) &(newpredlocktag.myXact->predicateLocks), &(newpredlock->xactLink));
+ }
+ 
+ oldpredlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ 
+ /* All locks were unlinked above, so the old target can go away now. */
+ Assert(SHMQueueEmpty(&oldtarget->predicateLocks));
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_REMOVE, NULL);
+ }
+ 
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+ }
+
+ /*
+ * Rescan all known serializable transactions and recompute the oldest
+ * xmin among those not yet on the finished list, along with a count of
+ * how many transactions share that xmin. The only caller holds
+ * SerializableXactHashLock exclusively while calling this.
+ */
+ static void
+ SetNewSerializableGlobalXmin(void)
+ {
+ HASH_SEQ_STATUS scan;
+ SERIALIZABLEXACT *xact;
+ 
+ SerializableGlobalXmin = InvalidTransactionId;
+ SerializableGlobalXminCount = 0;
+ 
+ hash_seq_init(&scan, SerializableXactHash);
+ while ((xact = (SERIALIZABLEXACT *) hash_seq_search(&scan)) != NULL)
+ {
+ /* Transactions already finished don't hold back the global xmin. */
+ if (SxactIsOnFinishedList(xact))
+ continue;
+ 
+ if (!TransactionIdIsValid(SerializableGlobalXmin)
+ || TransactionIdPrecedes(xact->xmin, SerializableGlobalXmin))
+ {
+ /* New minimum found; restart the count at one. */
+ SerializableGlobalXmin = xact->xmin;
+ SerializableGlobalXminCount = 1;
+ }
+ else if (xact->xmin == SerializableGlobalXmin)
+ SerializableGlobalXminCount++;
+ }
+ }
+
+ /*
+ * ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current
+ * transaction, whether committed or rolled back.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * For a rollback, the current transaction's predicate locks could be
+ * immediately released; however, we may still have conflict pointers to
+ * our transaction which could be expensive to find and eliminate right
+ * now, so we flag it as rolled back so that it will be ignored, and let
+ * cleanup happen later.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transaction still
+ * holding locks.
+ */
+ void
+ ReleasePredicateLocks(const bool isCommit)
+ {
+ bool needToClear;
+ 
+ /* Non-serializable transactions have nothing to release. */
+ if (MySerializableXact == InvalidSerializableXact)
+ {
+ Assert(LocalPredicateLockHash == NULL);
+ return;
+ }
+ 
+ Assert(IsXactIsoLevelFullySerializable);
+ 
+ /* We'd better not already be on the cleanup list. */
+ Assert(!SxactIsOnFinishedList((SERIALIZABLEXACT *) MySerializableXact));
+ 
+ /*
+ * If it's not a commit it's a rollback, and we can clear our locks
+ * immediately. TODO SSI: Clear the locks, but leave the sxact record.
+ */
+ if (!isCommit)
+ MySerializableXact->rolledBack = true;
+ 
+ /*
+ * Before publishing this transaction on the finished list, normalize
+ * our conflict pointers: a pointer to a rolled-back transaction becomes
+ * invalid, and a pointer to a committed transaction becomes a
+ * self-reference.
+ *
+ * NOTE(review): inConflict/outConflict (and their rolledBack flags) are
+ * read and written here without holding SerializableXactHashLock --
+ * confirm that no concurrent FlagRWConflict can race with this.
+ */
+ if (MySerializableXact->inConflict != InvalidSerializableXact)
+ {
+ if (MySerializableXact->inConflict->rolledBack)
+ MySerializableXact->inConflict = InvalidSerializableXact;
+ else if (SxactIsCommitted(MySerializableXact->inConflict))
+ MySerializableXact->inConflict = (SERIALIZABLEXACT *) MySerializableXact;
+ }
+ if (MySerializableXact->outConflict != InvalidSerializableXact)
+ {
+ if (MySerializableXact->outConflict->rolledBack)
+ MySerializableXact->outConflict = InvalidSerializableXact;
+ else if (SxactIsCommitted(MySerializableXact->outConflict))
+ MySerializableXact->outConflict = (SERIALIZABLEXACT *) MySerializableXact;
+ }
+ 
+ /* Add this to the list of transactions to check for later cleanup. */
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+ SHMQueueInsertBefore(FinishedSerializableTransactions,
+ (SHM_QUEUE *) &(MySerializableXact->finishedLink));
+ LWLockRelease(SerializableFinishedListLock);
+ 
+ /*
+ * Check whether it's time to clean up old transactions. This can only be
+ * done when the last serializable transaction with the oldest xmin among
+ * serializable transactions completes. We then find the "new oldest"
+ * xmin and purge any transactions which finished before this transaction
+ * was launched.
+ */
+ needToClear = false;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ if (TransactionIdPrecedes(SerializableGlobalXmin, RecentGlobalXmin))
+ {
+ /* Global xmin has fallen behind; recompute it unconditionally. */
+ SetNewSerializableGlobalXmin();
+ needToClear = true;
+ }
+ else if (MySerializableXact->xmin == SerializableGlobalXmin)
+ {
+ /* We were (one of) the holder(s) of the oldest xmin. */
+ Assert(SerializableGlobalXminCount > 0);
+ if (--SerializableGlobalXminCount == 0)
+ {
+ SetNewSerializableGlobalXmin();
+ needToClear = true;
+ }
+ }
+ LWLockRelease(SerializableXactHashLock);
+ 
+ if (needToClear)
+ ClearOldPredicateLocks();
+ 
+ /* Detach from the shared state; the sxact itself is cleaned up later. */
+ MySerializableXact = InvalidSerializableXact;
+ MyXid = InvalidTransactionId;
+ 
+ /* Delete per-transaction lock table */
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+ }
+
+ /*
+ * Clear old predicate locks.
+ *
+ * Walk the finished-transactions list and fully release every transaction
+ * which finished before all transactions still of interest: i.e. whose
+ * finishedBefore xid precedes or equals SerializableGlobalXmin, or all of
+ * them when no serializable transaction is active at all.
+ *
+ * Cleanup is best-effort: if another backend already holds the list lock,
+ * return immediately and leave the work to it.
+ */
+ static void
+ ClearOldPredicateLocks(void)
+ {
+ SERIALIZABLEXACT *finishedSxact;
+ 
+ if (!LWLockConditionalAcquire(SerializableFinishedListLock, LW_EXCLUSIVE))
+ return;
+ 
+ finishedSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ FinishedSerializableTransactions,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (finishedSxact)
+ {
+ SERIALIZABLEXACT *nextSxact;
+ 
+ /* Fetch the successor first; this entry may be deleted below. */
+ nextSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ &(finishedSxact->finishedLink),
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ if (!TransactionIdIsValid(SerializableGlobalXmin)
+ || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
+ SerializableGlobalXmin))
+ {
+ /*
+ * ReleaseOneSerializableXact acquires SerializableXactHashLock
+ * exclusively, so we must drop our shared hold across the call.
+ * NOTE(review): nextSxact presumably remains valid because we
+ * still hold SerializableFinishedListLock -- confirm nothing
+ * unlinks finished-list entries without that lock.
+ */
+ LWLockRelease(SerializableXactHashLock);
+ ReleaseOneSerializableXact(finishedSxact);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ finishedSxact = nextSxact;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(SerializableFinishedListLock);
+ }
+
+ /*
+ * This is the normal way to delete anything from any of the predicate
+ * locking hash tables. Given a transaction which we know can be deleted,
+ * delete all predicate locks held by that transaction, and any predicate
+ * lock targets which are now unreferenced by a lock; delete all xid values
+ * for the transaction; then delete the transaction.
+ *
+ * NOTE(review): the caller (ClearOldPredicateLocks) holds
+ * SerializableFinishedListLock; the SHMQueueDelete of finishedLink below
+ * appears to rely on that -- confirm before adding other callers.
+ */
+ static void
+ ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact)
+ {
+ PREDICATELOCK *predlock;
+ SERIALIZABLEXID *sxid;
+ 
+ Assert(sxact != NULL);
+ Assert(sxact->rolledBack || SxactIsCommitted(sxact));
+ Assert(SxactIsOnFinishedList(sxact));
+ 
+ /* Walk the transaction's predicate locks, removing each one. */
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG tag;
+ SHM_QUEUE *targetLink;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLockId partitionLock;
+ 
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(predlock->xactLink),
+ offsetof(PREDICATELOCK, xactLink));
+ 
+ /* Copy out what we need before the hash entry is recycled. */
+ tag = predlock->tag;
+ targetLink = &(predlock->targetLink);
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ /* Each target's partition lock also covers its attached locks. */
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ 
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ /* Unlink from the target's list of locks. */
+ SHMQueueDelete(targetLink);
+ 
+ /*
+ * No need to do retail removal from transaction object; it's going
+ * away.
+ */
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ /* A target with no remaining locks is itself removed. */
+ if (SHMQueueEmpty(&target->predicateLocks))
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &targettag, targettaghash, HASH_REMOVE, NULL);
+ LWLockRelease(partitionLock);
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializablePredicateLockListLock);
+ 
+ /* Get rid of the xids and the record of the transaction itself. */
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxid = (SERIALIZABLEXID *)
+ SHMQueueNext(&(sxact->xids),
+ &(sxact->xids),
+ offsetof(SERIALIZABLEXID, xactLink));
+ while (sxid)
+ {
+ SERIALIZABLEXID *nextsxid;
+ SERIALIZABLEXIDTAG tag;
+ 
+ nextsxid = (SERIALIZABLEXID *)
+ SHMQueueNext(&(sxact->xids),
+ &(sxid->xactLink),
+ offsetof(SERIALIZABLEXID, xactLink));
+ tag = sxid->tag;
+ hash_search(SerializableXidHash, &tag, HASH_REMOVE, NULL);
+ 
+ /*
+ * No need to do retail removal from transaction object; it's going
+ * away.
+ */
+ sxid = nextsxid;
+ }
+ /* Finally, unhook from the finished list and drop the sxact record. */
+ SHMQueueDelete(&(sxact->finishedLink));
+ hash_search(SerializableXactHash, &(sxact->tag), HASH_REMOVE, NULL);
+ LWLockRelease(SerializableXactHashLock);
+ }
+
+ /*
+ * Tests whether the given transaction is concurrent with (overlaps)
+ * our current transaction, judged against our transaction snapshot.
+ */
+ static bool
+ XidIsConcurrent(TransactionId xid)
+ {
+ Snapshot snapshot;
+ uint32 ix;
+ 
+ Assert(TransactionIdIsValid(xid));
+ 
+ /* Never treat our own top-level transaction as concurrent. */
+ if (xid == GetTopTransactionIdIfAny())
+ return false;
+ 
+ snapshot = GetTransactionSnapshot();
+ 
+ /* Anything before the snapshot's xmin completed before we started. */
+ if (TransactionIdPrecedes(xid, snapshot->xmin))
+ return false;
+ 
+ /* Anything at or past xmax started after the snapshot was taken. */
+ if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
+ return true;
+ 
+ /* In between: concurrent exactly when in progress at snapshot time. */
+ for (ix = 0; ix < snapshot->xcnt; ix++)
+ {
+ if (xid == snapshot->xip[ix])
+ return true;
+ }
+ 
+ return false;
+ }
+
+ /*
+ * CheckForSerializableConflictOut
+ * We are reading a tuple which has been modified. If it is visible to
+ * us but has been deleted, that indicates a rw-conflict out. If it's
+ * not visible and was created by a concurrent (overlapping)
+ * serializable transaction, that is also a rw-conflict out.
+ *
+ * The heap tables which we maintain for predicate locking will also be used
+ * to determine that the xmin from a row is related to a serializable
+ * transaction, and will provide a mapping to the top level transaction.
+ *
+ * This function should be called just about anywhere in heapam.c that a
+ * tuple has been read.
+ */
+ void
+ CheckForSerializableConflictOut(const bool valid, const Relation relation,
+ const HeapTuple tuple, const Buffer buffer)
+ {
+ TransactionId xid;
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACTTAG sxacttag;
+ SERIALIZABLEXACT *sxact;
+ 
+ if (SkipSerialization(relation))
+ return;
+ 
+ if (valid)
+ {
+ /*----------------------------------------------------------------
+ * TODO SSI: Figure out why the ItemPointerIsValid test is needed.
+ * We are sometimes failing with ip_posid == 0 in corner
+ * cases, like the following. Is this some underlying bug?
+ * If not, is this the best way to handle this?
+ *
+ * -- setup
+ * drop table ctl, receipt;
+ * create table ctl (k text not null primary key, deposit_date date not null);
+ * insert into ctl values ('receipt', date '2008-12-22');
+ * create table receipt (receipt_no int not null primary key, deposit_date date not null, amount numeric(13,2));
+ * insert into receipt values (1, (select deposit_date from ctl where k = 'receipt'), 1.00);
+ * insert into receipt values (2, (select deposit_date from ctl where k = 'receipt'), 2.00);
+ *
+ * -- connection 1
+ * start transaction isolation level serializable ;
+ * insert into receipt values (3, (select deposit_date from ctl where k = 'receipt'), 4.00);
+ *
+ * -- connection 2
+ * start transaction isolation level serializable ;
+ * update ctl set deposit_date = date '2008-12-23' where k = 'receipt';
+ *
+ * -- connection 3
+ * start transaction isolation level serializable ;
+ * select * from ctl;
+ *
+ * -- connection 2
+ * rollback;
+ *
+ * -- connection 3
+ * select * from re<Tab><Tab>[nothing shows]ceipt;
+ * > no connection to the server
+ * > The connection to the server was lost. Attempting reset: Succeeded.
+ *----------------------------------------------------------------
+ */
+ /* If there's a new tuple to key on, return to avoid duplicate work. */
+ if (ItemPointerIsValid(&(tuple->t_data->t_ctid))
+ && !ItemPointerEquals(&(tuple->t_self), &(tuple->t_data->t_ctid)))
+ return;
+ 
+ /*
+ * We may bail out if previous xmax aborted, or if it committed but
+ * only locked the tuple without updating it.
+ */
+ if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED))
+ return;
+ 
+ /*
+ * If there's a valid xmax, it must be from a concurrent transaction,
+ * since it deleted a tuple which is visible to us.
+ */
+ xid = HeapTupleHeaderGetXmax(tuple->t_data);
+ if (!TransactionIdIsValid(xid))
+ return;
+ }
+ else
+ {
+ /*
+ * We would read this row, but it isn't visible to us.
+ */
+ xid = HeapTupleHeaderGetXmin(tuple->t_data);
+ }
+ 
+ /*
+ * It's OK to look for conflicts with a share lock, and record them with
+ * an exclusive lock when found; we just have to release the shared lock
+ * before attempting to get the other lock, to prevent deadlocks. We will
+ * need to recheck that the entry still exists after getting the stronger
+ * lock, just in case it rolled back in the window where we weren't
+ * holding a lock.
+ */
+ sxidtag.xid = xid;
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (!sxid)
+ {
+ /* It's not serializable or otherwise not important. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ sxact = sxid->myXact;
+ if (sxact == MySerializableXact || sxact->rolledBack)
+ {
+ /* We can't conflict with our own transaction or one rolled back. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ 
+ /*
+ * If this is a read-only transaction and the writing transaction has
+ * committed, and it doesn't have a rw-conflict out or has a conflict out
+ * to a transaction which overlaps this transaction, then no conflict.
+ *
+ * (outConflict is a pointer, so test it against InvalidSerializableXact
+ * as done elsewhere in this file, not with TransactionIdIsValid.)
+ */
+ if (XactReadOnly
+ && SxactIsCommitted(sxact)
+ && (sxact->outConflict == InvalidSerializableXact
+ || (sxact != sxact->outConflict
+ && (!SxactIsCommitted(sxact->outConflict)
+ || XidIsConcurrent(sxact->outConflict->topXid)))))
+ {
+ /* Read-only transaction will appear to run first. No conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ 
+ /* Remember the tag so we can re-find the sxact after dropping the lock. */
+ sxacttag = sxact->tag;
+ LWLockRelease(SerializableXactHashLock);
+ 
+ /*
+ * Make sure we have somewhere to record a conflict against this
+ * transaction.
+ */
+ EnsureMySerializableXidExists();
+ 
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxact = (SERIALIZABLEXACT *)
+ hash_search(SerializableXactHash, &sxacttag, HASH_FIND, NULL);
+ if (!sxact)
+ {
+ /* It must have been cleaned up, which means it wasn't useful. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ xid = sxact->topXid;
+ if (!XidIsConcurrent(xid))
+ {
+ /* This write was already in our snapshot; no conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ 
+ /*
+ * Flag the conflict. But first, if this conflict creates a dangerous
+ * structure, ereport an error.
+ */
+ FlagRWConflict((SERIALIZABLEXACT *) MySerializableXact, sxact);
+ LWLockRelease(SerializableXactHashLock);
+ }
+
+ /*
+ * Check a particular target for rw-dependency conflict in.
+ *
+ * For each SIREAD lock on the target held by an overlapping transaction,
+ * flag a rw-conflict with that transaction as reader and ourselves as
+ * writer. As a side effect, a tuple-level SIREAD lock held by our own
+ * transaction is dropped here, since our own write makes it redundant.
+ */
+ static void
+ CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
+ {
+ uint32 targettaghash;
+ LWLockId partitionLock;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCK *predlock;
+ 
+ Assert(MySerializableXact != InvalidSerializableXact);
+ 
+ /* The same hash and LW lock apply to the lock target and the lock itself. */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ if (!target)
+ {
+ /* Nothing has this target locked; we're done here. */
+ LWLockRelease(partitionLock);
+ return;
+ }
+ 
+ /*
+ * Each lock for an overlapping transaction represents a conflict: a
+ * rw-dependency in to this transaction.
+ */
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (predlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ SERIALIZABLEXACT *sxact;
+ 
+ predlocktargetlink = &(predlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ 
+ sxact = predlock->tag.myXact;
+ if (sxact == MySerializableXact)
+ {
+ /*
+ * If we're getting a write lock on the tuple, we don't need a
+ * predicate (SIREAD) lock. At this point our transaction already
+ * has an ExclusiveRowLock on the relation, so we are OK to drop
+ * the predicate lock on the tuple, if found, without fearing that
+ * another write against the tuple will occur before the MVCC
+ * information makes it to the buffer.
+ */
+ /* Only tuple-granularity targets have a nonzero offset. */
+ if (GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCKTARGET *rmtarget = NULL;
+ PREDICATELOCK *rmpredlock;
+ LOCALPREDICATELOCK *locallock,
+ *rmlocallock;
+ 
+ /*
+ * This is a tuple on which we have a tuple predicate lock. We
+ * only have shared LW locks now; release those, and get
+ * exclusive locks only while we modify things.
+ */
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ LWLockAcquire(SerializablePredicateLockListLock, LW_SHARED);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ 
+ /*
+ * Remove the predicate lock from shared memory, if it hasn't
+ * been concurrently removed by an index page combine.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode(&(predlock->tag),
+ targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &(predlock->tag),
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock == predlock)
+ {
+ /* Still there: unlink from both queues, then remove. */
+ SHMQueueDelete(predlocktargetlink);
+ SHMQueueDelete(&(predlock->xactLink));
+ 
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &(predlock->tag),
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == predlock);
+ 
+ /*
+ * When a target is no longer used, remove it.
+ */
+ if (SHMQueueEmpty(&target->predicateLocks))
+ {
+ rmtarget = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag,
+ targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmtarget == target);
+ }
+ 
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ LWLockRelease(SerializablePredicateLockListLock);
+ 
+ /* Mirror the removal in our backend-local lock table. */
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ Assert(locallock != NULL);
+ Assert(locallock->held);
+ locallock->held = false;
+ 
+ if (locallock->childLocks == 0)
+ {
+ rmlocallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlocallock == locallock);
+ }
+ 
+ DecrementParentLocks(targettag);
+ 
+ /* The target itself is gone; no more locks to scan. */
+ if (rmtarget)
+ return;
+ 
+ /*
+ * All LW locks were released above, so restart the scan from
+ * the head of the target's lock list.
+ * NOTE(review): "target" is dereferenced here after the
+ * partition lock was released and reacquired -- confirm the
+ * entry cannot be recycled in that window.
+ */
+ LWLockAcquire(partitionLock, LW_SHARED);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ 
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else
+ {
+ /* Concurrently removed; just reacquire locks and go on. */
+ LWLockAcquire(partitionLock, LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ }
+ }
+ else if (!(sxact->rolledBack)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && sxact->outConflict != MySerializableXact
+ && MySerializableXact->inConflict != sxact)
+ {
+ /*
+ * Another transaction's SIREAD lock, not yet flagged against us:
+ * committed readers matter only if they finished after our
+ * snapshot was taken. Upgrade to exclusive to record the
+ * rw-conflict (which may raise a serialization failure).
+ */
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ 
+ FlagRWConflict(sxact, (SERIALIZABLEXACT *) MySerializableXact);
+ 
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ 
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ }
+
+ /*
+ * CheckForSerializableConflictIn
+ * We are writing the given tuple. If that indicates a rw-conflict
+ * in from another serializable transaction, take appropriate action.
+ *
+ * Skip checking for any granularity for which a parameter is missing.
+ *
+ * A tuple update or delete is in conflict if we have a predicate lock
+ * against the relation or page in which the tuple exists, or against the
+ * tuple itself. A tuple insert is in conflict only if there is a predicate
+ * lock against the entire relation.
+ *
+ * The call to this function also indicates that we need an entry in the
+ * serializable transaction hash table, so that this write's conflicts can
+ * be detected for the proper lifetime, which is until this transaction and
+ * all overlapping serializable transactions have completed.
+ */
+ void
+ CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple,
+ const Buffer buffer)
+ {
+ PREDICATELOCKTARGETTAG targettag;
+ 
+ if (SkipSerialization(relation))
+ return;
+ 
+ /* Guarantee our xid is mapped for conflict tracking (see above). */
+ EnsureMySerializableXidExists();
+ 
+ /*
+ * It is important that we check for locks from the finest granularity to
+ * the coarsest granularity, so that granularity promotion doesn't cause
+ * us to miss a lock. The new (coarser) lock will be acquired before the
+ * old (finer) locks are released.
+ *
+ * It is not possible to take and hold a lock across the checks for all
+ * granularities because each target could be in a separate partition.
+ */
+ if (tuple != NULL)
+ {
+ /*
+ * NOTE(review): this keys the tuple target on t_ctid rather than
+ * t_self -- confirm that's intended for tuples which have not been
+ * updated (CheckForSerializableConflictOut keys on t_self).
+ */
+ SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(&(tuple->t_data->t_ctid)),
+ ItemPointerGetOffsetNumber(&(tuple->t_data->t_ctid)));
+ CheckTargetForConflictsIn(&targettag);
+ }
+ 
+ if (BufferIsValid(buffer))
+ {
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ BufferGetBlockNumber(buffer));
+ CheckTargetForConflictsIn(&targettag);
+ }
+ 
+ /* Relation-level check always applies. */
+ SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ /*
+ * Flag a rw-dependency between two serializable transactions.
+ *
+ * Each side keeps a single conflict pointer rather than a list: an
+ * invalid (or rolled-back) pointer is replaced with the other
+ * transaction; a pointer already naming the other transaction is left
+ * alone; any other value collapses to a self-reference, meaning
+ * "multiple conflicts".
+ *
+ * The caller is responsible for ensuring that we have a LW lock on
+ * the transaction hash table.
+ */
+ static void
+ FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+ {
+ Assert(reader != writer);
+ 
+ /* First, see if this conflict causes failure. */
+ OnConflict_CheckForSerializationFailure(reader, writer);
+ 
+ /* Record the conflict on the writer's "in" side. */
+ if (writer->inConflict != InvalidSerializableXact
+ && !writer->inConflict->rolledBack)
+ {
+ if (writer->inConflict != reader)
+ writer->inConflict = writer; /* collapse to self-reference */
+ }
+ else
+ writer->inConflict = reader;
+ 
+ /* And on the reader's "out" side. */
+ if (reader->outConflict != InvalidSerializableXact
+ && !reader->outConflict->rolledBack)
+ {
+ if (reader->outConflict != writer)
+ reader->outConflict = reader; /* collapse to self-reference */
+ }
+ else
+ reader->outConflict = writer;
+ }
+
+ /*
+ * Check whether we should roll back one of these transactions
+ * instead of flagging a conflict.
+ *
+ * Raises a serialization failure if recording this rw-conflict would
+ * complete a dangerous structure: a pivot (a transaction with both an
+ * "in" and an "out" conflict) whose out side committed first, or one
+ * whose self-referencing pointers would prevent the check from being
+ * performed at commit time.
+ *
+ * The caller must hold SerializableXactHashLock (see Assert below).
+ */
+ static void
+ OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ const SERIALIZABLEXACT *writer)
+ {
+ bool failure;
+ 
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+ 
+ failure = false;
+ 
+ /* Case 1: the new edge makes the WRITER a pivot (in + out conflict). */
+ if (writer->inConflict != reader
+ && writer->outConflict != InvalidSerializableXact
+ && !(writer->outConflict->rolledBack))
+ {
+ /* The writer is or is becoming a pivot. */
+ /* Self-reference prevents checking commit sequence. */
+ if (writer->outConflict == writer
+ 
+ /*
+ * TODO SSI: Resolve this performance tweak issue.
+ *
+ * Back-and-forth reference is write skew; thus doomed; however,
+ * rolling back here increases chances that a retry will still fail.
+ * It may be better to let it happen at commit time. Only performance
+ * testing can determine whether the next line should be used.
+ *
+ * Leaving it out would be *especially* valuable if the PreCommit
+ * checking could be changed to allow a commit in a situation where it
+ * is leaving another transaction in a state where a commit must fail
+ * -- when the doomed transaction eventually tries to commit, it would
+ * probably be at a time when an immediate retry is very likely to
+ * succeed.
+ */
+ /* || writer->outConflict == reader */
+ )
+ failure = true;
+ else if (SxactIsCommitted(writer->outConflict))
+ {
+ if (SxactCommittedBefore(writer->outConflict, writer)
+ && SxactCommittedBefore(writer->outConflict, reader))
+ /* The out side of the pivot committed first. */
+ failure = true;
+ }
+ else
+ {
+ if (writer->outConflict->inConflict == writer->outConflict)
+ /* Self-reference will prevent checking at commit. */
+ failure = true;
+ }
+ }
+ 
+ /* Case 2: the new edge makes the READER a pivot. */
+ if (reader->outConflict != writer
+ && reader->inConflict != InvalidSerializableXact
+ && !(reader->inConflict->rolledBack))
+ {
+ /* The reader is or is becoming a pivot. */
+ if (SxactIsCommitted(writer))
+ {
+ if (SxactCommittedBefore(writer, reader)
+ && (reader->inConflict == reader
+ || SxactCommittedBefore(writer, reader->inConflict)))
+ /* The out side committed first, as far as we can tell. */
+ failure = true;
+ }
+ else if (writer->inConflict != InvalidSerializableXact
+ && writer->inConflict != reader)
+ /* Self-reference will prevent checking at commit. */
+ failure = true;
+ }
+ 
+ if (failure)
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ /*
+ * PreCommit_CheckForSerializationFailure
+ * Check for dangerous structures in a serializable transaction
+ * at commit.
+ *
+ * We're checking for a dangerous structure as each conflict is recorded.
+ * The only way we could have a problem at commit is if this is the "out"
+ * side of a pivot, and neither the "in" side nor the pivot itself has yet
+ * committed.
+ */
+ void
+ PreCommit_CheckForSerializationFailure(void)
+ {
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+ 
+ Assert(IsXactIsoLevelFullySerializable);
+ 
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ 
+ /*
+ * Checking at conflict detection should only allow self-reference in if
+ * this transaction is on the out side of a pivot, so self-reference is
+ * OK here.
+ */
+ if (MySerializableXact->inConflict != InvalidSerializableXact
+ && MySerializableXact->inConflict != MySerializableXact
+ && !(MySerializableXact->inConflict->rolledBack)
+ && MySerializableXact->inConflict->inConflict != InvalidSerializableXact
+ && !SxactIsCommitted(MySerializableXact->inConflict)
+ && !SxactIsCommitted(MySerializableXact->inConflict->inConflict))
+ {
+ /*
+ * Record the horizon before erroring out; NOTE(review): presumably
+ * so later cleanup of this doomed transaction is not blocked --
+ * confirm. (ereport(ERROR) does not return.)
+ */
+ MySerializableXact->finishedBefore = ShmemVariableCache->nextXid;
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errhint("The transaction might succeed if retried.")));
+ }
+ 
+ /* Commit is allowed; remember the xid horizon as of commit time. */
+ MySerializableXact->finishedBefore = ShmemVariableCache->nextXid;
+ LWLockRelease(SerializableXactHashLock);
+ }
*** a/src/backend/utils/adt/lockfuncs.c
--- b/src/backend/utils/adt/lockfuncs.c
***************
*** 17,22 ****
--- 17,23 ----
#include "miscadmin.h"
#include "storage/proc.h"
#include "utils/builtins.h"
+ #include "storage/predicate.h"
/* This must match enum LockTagType! */
***************
*** 32,42 **** static const char *const LockTagTypeNames[] = {
--- 33,52 ----
"advisory"
};
+ /* This must match enum PredicateLockTargetType (predicate.h) */
+ static const char *const PredicateLockTagTypeNames[] = {
+ "relation",
+ "page",
+ "tuple"
+ };
+
/* Working status for pg_lock_status */
typedef struct
{
LockData *lockData; /* state data from lmgr */
int currIdx; /* current PROCLOCK index */
+ PredicateLockData *predLockData; /* state data for pred locks */
+ int predLockIdx; /* current index for pred lock */
} PG_Lock_Status;
***************
*** 69,74 **** pg_lock_status(PG_FUNCTION_ARGS)
--- 79,85 ----
FuncCallContext *funcctx;
PG_Lock_Status *mystatus;
LockData *lockData;
+ PredicateLockData *predLockData;
if (SRF_IS_FIRSTCALL())
{
***************
*** 126,131 **** pg_lock_status(PG_FUNCTION_ARGS)
--- 137,144 ----
mystatus->lockData = GetLockStatusData();
mystatus->currIdx = 0;
+ mystatus->predLockData = GetPredicateLockStatusData();
+ mystatus->predLockIdx = 0;
MemoryContextSwitchTo(oldcontext);
}
***************
*** 303,308 **** pg_lock_status(PG_FUNCTION_ARGS)
--- 316,385 ----
SRF_RETURN_NEXT(funcctx, result);
}
+ /*
+ * Have returned all regular locks. Now start on the SIREAD predicate
+ * locks.
+ */
+ predLockData = mystatus->predLockData;
+ if (mystatus->predLockIdx < predLockData->nelements)
+ {
+ PREDICATELOCKTARGETTAG *predTag = &(predLockData->locktags[mystatus->predLockIdx]);
+ SERIALIZABLEXACT *xact = &(predLockData->xacts[mystatus->predLockIdx]);
+ Datum values[14];
+ bool nulls[14];
+ HeapTuple tuple;
+ Datum result;
+
+ mystatus->predLockIdx++;
+
+ /*
+ * Form tuple with appropriate data.
+ */
+ MemSet(values, 0, sizeof(values));
+ MemSet(nulls, false, sizeof(nulls));
+
+ /* lock type */
+ PredicateLockTargetType lockType = GET_PREDICATELOCKTARGETTAG_TYPE(*predTag);
+
+ values[0] = CStringGetTextDatum(PredicateLockTagTypeNames[lockType]);
+
+ /* lock target */
+ values[1] = GET_PREDICATELOCKTARGETTAG_DB(*predTag);
+ values[2] = GET_PREDICATELOCKTARGETTAG_RELATION(*predTag);
+ if (lockType == PREDLOCKTAG_TUPLE)
+ values[4] = GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag);
+ else
+ nulls[4] = true;
+ if ((lockType == PREDLOCKTAG_TUPLE) ||
+ (lockType == PREDLOCKTAG_PAGE))
+ values[3] = GET_PREDICATELOCKTARGETTAG_PAGE(*predTag);
+ else
+ nulls[3] = true;
+
+ /* these fields are targets for other types of locks */
+ nulls[5] = true; /* virtualxid */
+ nulls[6] = true; /* transactionid */
+ nulls[7] = true; /* classid */
+ nulls[8] = true; /* objid */
+ nulls[9] = true; /* objsubid */
+
+ /* lock holder */
+ values[10] = VXIDGetDatum(xact->tag.vxid.backendId,
+ xact->tag.vxid.localTransactionId);
+ nulls[11] = true; /* pid */
+
+ /*
+ * Lock mode. Currently all predicate locks are SIReadLocks, which are
+ * always held (never waiting)
+ */
+ values[12] = CStringGetTextDatum("SIReadLock");
+ values[13] = BoolGetDatum(true);
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+
SRF_RETURN_DONE(funcctx);
}
*** a/src/backend/utils/adt/ri_triggers.c
--- b/src/backend/utils/adt/ri_triggers.c
***************
*** 3308,3314 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan,
/*
* In READ COMMITTED mode, we just need to use an up-to-date regular
* snapshot, and we will see all rows that could be interesting. But in
! * SERIALIZABLE mode, we can't change the transaction snapshot. If the
* caller passes detectNewRows == false then it's okay to do the query
* with the transaction snapshot; otherwise we use a current snapshot, and
* tell the executor to error out if it finds any rows under the current
--- 3308,3314 ----
/*
* In READ COMMITTED mode, we just need to use an up-to-date regular
* snapshot, and we will see all rows that could be interesting. But in
! * xact-snapshot-based modes, we can't change the transaction snapshot. If the
* caller passes detectNewRows == false then it's okay to do the query
* with the transaction snapshot; otherwise we use a current snapshot, and
* tell the executor to error out if it finds any rows under the current
***************
*** 3316,3322 **** ri_PerformCheck(RI_QueryKey *qkey, SPIPlanPtr qplan,
* that SPI_execute_snapshot will register the snapshots, so we don't need
* to bother here.
*/
! if (IsXactIsoLevelSerializable && detectNewRows)
{
CommandCounterIncrement(); /* be sure all my own work is visible */
test_snapshot = GetLatestSnapshot();
--- 3316,3322 ----
* that SPI_execute_snapshot will register the snapshots, so we don't need
* to bother here.
*/
! if (IsXactIsoLevelXactSnapshotBased && detectNewRows)
{
CommandCounterIncrement(); /* be sure all my own work is visible */
test_snapshot = GetLatestSnapshot();
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 59,64 ****
--- 59,65 ----
#include "storage/bufmgr.h"
#include "storage/standby.h"
#include "storage/fd.h"
+ #include "storage/predicate.h"
#include "tcop/tcopprot.h"
#include "tsearch/ts_cache.h"
#include "utils/builtins.h"
***************
*** 1654,1659 **** static struct config_int ConfigureNamesInt[] =
--- 1655,1671 ----
},
{
+ {"max_predicate_locks_per_transaction", PGC_POSTMASTER, LOCK_MANAGEMENT,
+ gettext_noop("Sets the maximum number of predicate locks per transaction."),
+ gettext_noop("The shared predicate lock table is sized on the assumption that "
+ "at most max_predicate_locks_per_transaction * max_connections distinct "
+ "objects will need to be locked at any one time.")
+ },
+ &max_predicate_locks_per_xact,
+ 64, 10, INT_MAX, NULL, NULL
+ },
+
+ {
{"authentication_timeout", PGC_SIGHUP, CONN_AUTH_SECURITY,
gettext_noop("Sets the maximum allowed time to complete client authentication."),
NULL,
*** a/src/backend/utils/resowner/resowner.c
--- b/src/backend/utils/resowner/resowner.c
***************
*** 261,267 **** ResourceOwnerReleaseInternal(ResourceOwner owner,
--- 261,270 ----
* the top of the recursion.
*/
if (owner == TopTransactionResourceOwner)
+ {
ProcReleaseLocks(isCommit);
+ ReleasePredicateLocks(isCommit);
+ }
}
else
{
*** a/src/backend/utils/time/snapmgr.c
--- b/src/backend/utils/time/snapmgr.c
***************
*** 37,44 ****
/*
! * CurrentSnapshot points to the only snapshot taken in a serializable
! * transaction, and to the latest one taken in a read-committed transaction.
* SecondarySnapshot is a snapshot that's always up-to-date as of the current
* instant, even on a serializable transaction. It should only be used for
* special-purpose code (say, RI checking.)
--- 37,44 ----
/*
! * CurrentSnapshot points to the only snapshot taken in an xact-snapshot-based
! * transaction; otherwise to the latest one taken.
* SecondarySnapshot is a snapshot that's always up-to-date as of the current
* instant, even on a serializable transaction. It should only be used for
* special-purpose code (say, RI checking.)
***************
*** 97,107 **** static int RegisteredSnapshots = 0;
bool FirstSnapshotSet = false;
/*
! * Remembers whether this transaction registered a serializable snapshot at
* start. We cannot trust FirstSnapshotSet in combination with
! * IsXactIsoLevelSerializable, because GUC may be reset before us.
*/
! static bool registered_serializable = false;
static Snapshot CopySnapshot(Snapshot snapshot);
--- 97,107 ----
bool FirstSnapshotSet = false;
/*
! * Remembers whether this transaction registered a transaction-based snapshot at
* start. We cannot trust FirstSnapshotSet in combination with
! * IsXactIsoLevelXactSnapshotBased, because GUC may be reset before us.
*/
! static bool registered_xact_snapshot = false;
static Snapshot CopySnapshot(Snapshot snapshot);
***************
*** 130,150 **** GetTransactionSnapshot(void)
FirstSnapshotSet = true;
/*
! * In serializable mode, the first snapshot must live until end of
! * xact regardless of what the caller does with it, so we must
! * register it internally here and unregister it at end of xact.
*/
! if (IsXactIsoLevelSerializable)
{
CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_serializable = true;
}
return CurrentSnapshot;
}
! if (IsXactIsoLevelSerializable)
return CurrentSnapshot;
CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
--- 130,153 ----
FirstSnapshotSet = true;
/*
! * In xact-snapshot-based isolation levels, the first snapshot must
! * live until end of xact regardless of what the caller does with it,
! * so we must register it internally here and unregister it at end of
! * xact.
*/
! if (IsXactIsoLevelXactSnapshotBased)
{
CurrentSnapshot = RegisterSnapshotOnOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_xact_snapshot = true;
! if (IsXactIsoLevelFullySerializable)
! RegisterSerializableTransaction(CurrentSnapshot);
}
return CurrentSnapshot;
}
! if (IsXactIsoLevelXactSnapshotBased)
return CurrentSnapshot;
CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
***************
*** 155,161 **** GetTransactionSnapshot(void)
/*
* GetLatestSnapshot
* Get a snapshot that is up-to-date as of the current instant,
! * even if we are executing in SERIALIZABLE mode.
*/
Snapshot
GetLatestSnapshot(void)
--- 158,164 ----
/*
* GetLatestSnapshot
* Get a snapshot that is up-to-date as of the current instant,
! * even if we are executing in xact-snapshot-based mode.
*/
Snapshot
GetLatestSnapshot(void)
***************
*** 515,527 **** void
AtEarlyCommit_Snapshot(void)
{
/*
! * On a serializable transaction we must unregister our private refcount
! * to the serializable snapshot.
*/
! if (registered_serializable)
UnregisterSnapshotFromOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_serializable = false;
}
--- 518,530 ----
AtEarlyCommit_Snapshot(void)
{
/*
! * On an xact-snapshot-based transaction we must unregister our private
! * refcount to the xact snapshot.
*/
! if (registered_xact_snapshot)
UnregisterSnapshotFromOwner(CurrentSnapshot,
TopTransactionResourceOwner);
! registered_xact_snapshot = false;
}
***************
*** 557,561 **** AtEOXact_Snapshot(bool isCommit)
SecondarySnapshot = NULL;
FirstSnapshotSet = false;
! registered_serializable = false;
}
--- 560,564 ----
SecondarySnapshot = NULL;
FirstSnapshotSet = false;
! registered_xact_snapshot = false;
}
*** a/src/include/access/heapam.h
--- b/src/include/access/heapam.h
***************
*** 82,89 **** extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
extern bool heap_fetch(Relation relation, Snapshot snapshot,
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
Relation stats_relation);
! extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer,
! Snapshot snapshot, bool *all_dead);
extern bool heap_hot_search(ItemPointer tid, Relation relation,
Snapshot snapshot, bool *all_dead);
--- 82,89 ----
extern bool heap_fetch(Relation relation, Snapshot snapshot,
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
Relation stats_relation);
! extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
! Buffer buffer, Snapshot snapshot, bool *all_dead);
extern bool heap_hot_search(ItemPointer tid, Relation relation,
Snapshot snapshot, bool *all_dead);
*** a/src/include/access/xact.h
--- b/src/include/access/xact.h
***************
*** 32,41 **** extern int DefaultXactIsoLevel;
extern int XactIsoLevel;
/*
! * We only implement two isolation levels internally. This macro should
! * be used to check which one is selected.
*/
! #define IsXactIsoLevelSerializable (XactIsoLevel >= XACT_REPEATABLE_READ)
/* Xact read-only state */
extern bool DefaultXactReadOnly;
--- 32,45 ----
extern int XactIsoLevel;
/*
! * We implement three isolation levels internally.
! * The two stronger ones use one snapshot per database transaction;
! * the others use one snapshot per statement.
! * Serializable uses predicate locks.
! * These macros should be used to check which isolation level is selected.
*/
! #define IsXactIsoLevelXactSnapshotBased (XactIsoLevel >= XACT_REPEATABLE_READ)
! #define IsXactIsoLevelFullySerializable (XactIsoLevel == XACT_SERIALIZABLE)
/* Xact read-only state */
extern bool DefaultXactReadOnly;
*** a/src/include/catalog/pg_am.h
--- b/src/include/catalog/pg_am.h
***************
*** 49,54 **** CATALOG(pg_am,2601)
--- 49,55 ----
bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */
bool amstorage; /* can storage type differ from column type? */
bool amclusterable; /* does AM support cluster command? */
+ bool ampredlocks; /* does AM handle predicate locks? */
Oid amkeytype; /* type of data in index, or InvalidOid */
regproc aminsert; /* "insert this tuple" function */
regproc ambeginscan; /* "start new scan" function */
***************
*** 76,82 **** typedef FormData_pg_am *Form_pg_am;
* compiler constants for pg_am
* ----------------
*/
! #define Natts_pg_am 26
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
--- 77,83 ----
* compiler constants for pg_am
* ----------------
*/
! #define Natts_pg_am 27
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
***************
*** 89,124 **** typedef FormData_pg_am *Form_pg_am;
#define Anum_pg_am_amsearchnulls 10
#define Anum_pg_am_amstorage 11
#define Anum_pg_am_amclusterable 12
! #define Anum_pg_am_amkeytype 13
! #define Anum_pg_am_aminsert 14
! #define Anum_pg_am_ambeginscan 15
! #define Anum_pg_am_amgettuple 16
! #define Anum_pg_am_amgetbitmap 17
! #define Anum_pg_am_amrescan 18
! #define Anum_pg_am_amendscan 19
! #define Anum_pg_am_ammarkpos 20
! #define Anum_pg_am_amrestrpos 21
! #define Anum_pg_am_ambuild 22
! #define Anum_pg_am_ambulkdelete 23
! #define Anum_pg_am_amvacuumcleanup 24
! #define Anum_pg_am_amcostestimate 25
! #define Anum_pg_am_amoptions 26
/* ----------------
* initial contents of pg_am
* ----------------
*/
! DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DESCR("hash index access method");
#define HASH_AM_OID 405
! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
--- 90,126 ----
#define Anum_pg_am_amsearchnulls 10
#define Anum_pg_am_amstorage 11
#define Anum_pg_am_amclusterable 12
! #define Anum_pg_am_ampredlocks 13
! #define Anum_pg_am_amkeytype 14
! #define Anum_pg_am_aminsert 15
! #define Anum_pg_am_ambeginscan 16
! #define Anum_pg_am_amgettuple 17
! #define Anum_pg_am_amgetbitmap 18
! #define Anum_pg_am_amrescan 19
! #define Anum_pg_am_amendscan 20
! #define Anum_pg_am_ammarkpos 21
! #define Anum_pg_am_amrestrpos 22
! #define Anum_pg_am_ambuild 23
! #define Anum_pg_am_ambulkdelete 24
! #define Anum_pg_am_amvacuumcleanup 25
! #define Anum_pg_am_amcostestimate 26
! #define Anum_pg_am_amoptions 27
/* ----------------
* initial contents of pg_am
* ----------------
*/
! DATA(insert OID = 403 ( btree 5 1 t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
! DATA(insert OID = 405 ( hash 1 1 f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DESCR("hash index access method");
#define HASH_AM_OID 405
! DATA(insert OID = 783 ( gist 0 7 f f f t t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
! DATA(insert OID = 2742 ( gin 0 5 f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
*** a/src/include/storage/lwlock.h
--- b/src/include/storage/lwlock.h
***************
*** 27,32 ****
--- 27,36 ----
#define LOG2_NUM_LOCK_PARTITIONS 4
#define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS)
+ /* Number of partitions the shared predicate lock tables are divided into */
+ #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4
+ #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
/*
* We have a number of predefined LWLocks, plus a bunch of LWLocks that are
* dynamically assigned (e.g., for shared buffers). The LWLock structures
***************
*** 70,81 **** typedef enum LWLockId
RelationMappingLock,
AsyncCtlLock,
AsyncQueueLock,
/* Individual lock IDs end here */
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
/* must be last except for MaxDynamicLWLock: */
! NumFixedLWLocks = FirstLockMgrLock + NUM_LOCK_PARTITIONS,
MaxDynamicLWLock = 1000000000
} LWLockId;
--- 74,89 ----
RelationMappingLock,
AsyncCtlLock,
AsyncQueueLock,
+ SerializableXactHashLock,
+ SerializableFinishedListLock,
+ SerializablePredicateLockListLock,
/* Individual lock IDs end here */
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
+ FirstPredicateLockMgrLock = FirstLockMgrLock + NUM_LOCK_PARTITIONS,
/* must be last except for MaxDynamicLWLock: */
! NumFixedLWLocks = FirstPredicateLockMgrLock + NUM_PREDICATELOCK_PARTITIONS,
MaxDynamicLWLock = 1000000000
} LWLockId;
*** /dev/null
--- b/src/include/storage/predicate.h
***************
*** 0 ****
--- 1,174 ----
+ /*-------------------------------------------------------------------------
+ *
+ * predicate.h
+ * POSTGRES predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+ #ifndef PREDICATE_H
+ #define PREDICATE_H
+
+ #include "access/htup.h"
+ #include "utils/snapshot.h"
+
+ /* GUC variables */
+ extern int max_predicate_locks_per_xact;
+
+ /*
+ * The SERIALIZABLEXACTTAG struct identifies a serializable transaction.
+ */
+ typedef struct SERIALIZABLEXACTTAG
+ {
+ VirtualTransactionId vxid; /* We always have one of these. */
+ } SERIALIZABLEXACTTAG;
+
+ /*
+ * Information needed for each serializable database transaction to support SSI techniques.
+ * TODO SSI: Should inConflict and outConflict be lists? That would allow us to reduce
+ * false positives, *and* would allow us to guarantee that an immediate retry
+ * of a transaction would never fail on the exact same conflicts.
+ * The RAM doesn't look like it would be the limiting factor, but CPU time might
+ * be -- we should have baseline benchmarks before attempting this.
+ */
+ typedef struct SERIALIZABLEXACT
+ {
+ /* hash key */
+ SERIALIZABLEXACTTAG tag;
+
+ /* data */
+ struct SERIALIZABLEXACT *outConflict; /* ptr to write transaction
+ * whose data we couldn't
+ * read. invalid means no
+ * conflict; self-reference
+ * means multiple or
+ * committed. */
+ struct SERIALIZABLEXACT *inConflict; /* ptr to read transaction
+ * which couldn't see our
+ * write. invalid means no
+ * conflict; self-reference
+ * means multiple or
+ * committed. */
+ TransactionId topXid; /* top level xid for the transaction, if one
+ * exists */
+ TransactionId finishedBefore; /* invalid means still running; else
+ * the struct expires when no tags <
+ * this. */
+ TransactionId xmin; /* the transaction's snapshot xmin */
+ SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
+ SHM_QUEUE xids; /* list of associated SERIALIZABLEXID objects */
+ SHM_QUEUE finishedLink; /* list link in
+ * FinishedSerializableTransactions */
+ bool rolledBack; /* ignore conflicts when true; allows deferred
+ * cleanup */
+ } SERIALIZABLEXACT;
+
+
+ typedef enum PredicateLockTargetType
+ {
+ PREDLOCKTAG_RELATION,
+ PREDLOCKTAG_PAGE,
+ PREDLOCKTAG_TUPLE
+ /* TODO Other types may be needed for index locking */
+ } PredicateLockTargetType;
+
+ /*
+ * The PREDICATELOCKTARGETTAG struct is defined to fit into 16
+ * bytes with no padding. Note that this would need adjustment if we were
+ * to widen Oid or BlockNumber to more than 32 bits.
+ */
+ typedef struct PREDICATELOCKTARGETTAG
+ {
+ uint32 locktag_field1; /* a 32-bit ID field */
+ uint32 locktag_field2; /* a 32-bit ID field */
+ uint32 locktag_field3; /* a 32-bit ID field */
+ uint16 locktag_field4; /* a 16-bit ID field */
+ uint16 locktag_field5; /* a 16-bit ID field */
+ } PREDICATELOCKTARGETTAG;
+
+ /*
+ * These macros define how we map logical IDs of lockable objects into
+ * the physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
+ * rather than accessing the fields directly. Note multiple eval of target!
+ *
+ * TODO SSI: If we always use the same fields for the same type of value,
+ * we should rename these. Holding off until it's clear there are no exceptions.
+ * Since indexes are relations with blocks and tuples, it's looking likely that
+ * the rename will be possible. If not, we may need to divide the last field
+ and use part of it for a target type, so that we know how to interpret the
+ data.
+ */
+ #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = InvalidBlockNumber, \
+ (locktag).locktag_field4 = InvalidOffsetNumber, \
+ (locktag).locktag_field5 = 0)
+
+ #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = InvalidOffsetNumber, \
+ (locktag).locktag_field5 = 0)
+
+ #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = (offnum), \
+ (locktag).locktag_field5 = 0)
+
+ #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
+ ((locktag).locktag_field1)
+ #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
+ ((locktag).locktag_field2)
+ #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
+ ((locktag).locktag_field3)
+ #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
+ ((locktag).locktag_field4)
+ #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
+ (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
+ (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
+ PREDLOCKTAG_RELATION))
+
+ typedef struct PredicateLockData
+ {
+ int nelements;
+ PREDICATELOCKTARGETTAG *locktags;
+ SERIALIZABLEXACT *xacts;
+ } PredicateLockData;
+
+ /*
+ * function prototypes
+ */
+
+ /* housekeeping for shared memory predicate lock structures */
+ extern void InitPredicateLocks(void);
+ extern Size PredicateLockShmemSize(void);
+
+ /* predicate lock reporting */
+ extern PredicateLockData *GetPredicateLockStatusData(void);
+
+ /* predicate lock maintenance */
+ extern void RegisterSerializableTransaction(const Snapshot snapshot);
+ extern void PredicateLockRelation(const Relation relation);
+ extern void PredicateLockPage(const Relation relation, const BlockNumber blkno);
+ extern void PredicateLockTuple(const Relation relation, const HeapTuple tuple);
+ extern void PredicateLockPageSplit(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
+ extern void PredicateLockPageCombine(const Relation relation, const BlockNumber oldblkno, const BlockNumber newblkno);
+ extern void ReleasePredicateLocks(const bool isCommit);
+
+ /* conflict detection (may also trigger rollback) */
+ extern void CheckForSerializableConflictOut(const bool valid, const Relation relation, const HeapTuple tuple, const Buffer buffer);
+ extern void CheckForSerializableConflictIn(const Relation relation, const HeapTuple tuple, const Buffer buffer);
+
+ /* final rollback checking */
+ extern void PreCommit_CheckForSerializationFailure(void);
+
+ #endif /* PREDICATE_H */
*** a/src/include/storage/shmem.h
--- b/src/include/storage/shmem.h
***************
*** 70,74 **** extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem);
--- 70,75 ----
extern Pointer SHMQueueNext(SHM_QUEUE *queue, SHM_QUEUE *curElem,
Size linkOffset);
extern bool SHMQueueEmpty(SHM_QUEUE *queue);
+ extern bool SHMQueueIsDetached(SHM_QUEUE *queue);
#endif /* SHMEM_H */
*** a/src/test/regress/GNUmakefile
--- b/src/test/regress/GNUmakefile
***************
*** 135,140 **** tablespace-setup:
--- 135,157 ----
##
+ ## Prepare for dtester tests
+ ##
+ pg_dtester.py: pg_dtester.py.in GNUmakefile $(top_builddir)/src/Makefile.global
+ sed -e 's,@bindir@,$(bindir),g' \
+ -e 's,@libdir@,$(libdir),g' \
+ -e 's,@pkglibdir@,$(pkglibdir),g' \
+ -e 's,@datadir@,$(datadir),g' \
+ -e 's/@VERSION@/$(VERSION)/g' \
+ -e 's/@host_tuple@/$(host_tuple)/g' \
+ -e 's,@GMAKE@,$(MAKE),g' \
+ -e 's/@enable_shared@/$(enable_shared)/g' \
+ -e 's/@GCC@/$(GCC)/g' \
+ $< >$@
+ chmod a+x $@
+
+
+ ##
## Run tests
##
***************
*** 152,157 **** installcheck-parallel: all
--- 169,179 ----
standbycheck: all
$(pg_regress_call) --psqldir=$(PSQLDIR) --schedule=$(srcdir)/standby_schedule --use-existing
+ dcheck: pg_dtester.py
+ ./pg_dtester.py --temp-install --top-builddir=$(top_builddir) \
+ --multibyte=$(MULTIBYTE) $(MAXCONNOPT) $(NOLOCALE)
+
+
# old interfaces follow...
runcheck: check
*** /dev/null
--- b/src/test/regress/pg_dtester.py.in
***************
*** 0 ****
--- 1,1626 ----
+ #!/usr/bin/python
+
+ #-------------------------------------------------------------------------
+ #
+ # dtester.py.in
+ #
+ # Sample test suite running two concurrent transactions, showing
+ # off some capabilities of dtester.
+ #
+ # Copyright (c) 2006-2010, Markus Wanner
+ #
+ #-------------------------------------------------------------------------
+
+ import re, os, sys, getopt
+ from twisted.internet import defer, reactor
+ from twisted.python import failure
+
+ from dtester.events import EventMatcher, EventSource, Event, \
+ ProcessOutputEvent, ProcessErrorEvent, ProcessEndedEvent
+ from dtester.exceptions import TestAborted, TestFailure
+ from dtester.test import TestSuite, BaseTest, SyncTest
+ from dtester.reporter import StreamReporter, CursesReporter
+ from dtester.runner import Runner, Timeout
+
+ # ****** definition of tests and suites ***********************************
+
+ class InstallationSuite(TestSuite):
+
+ setUpDescription = "creating temporary installation"
+ tearDownDescription = "removing temporary installation"
+
+ needs = (('shell', "IShell or something"),)
+
+ def setUp(self):
+ # inherit getConfig from the shell
+ setattr(self, 'getConfig', self.shell.getConfig)
+ setattr(self, 'runCommand', self.shell.runCommand)
+ setattr(self, 'recursive_remove', self.shell.recursive_remove)
+
+ # (re) create an installation directory
+ self.pg_inst_dir = self.shell.getConfig('inst_dir')
+ if os.path.exists(self.pg_inst_dir):
+ self.shell.recursive_remove(self.pg_inst_dir)
+ os.mkdir(self.pg_inst_dir)
+
+ # install into that directory
+ proc = self.shell.runCommand('make', 'make',
+ args=['make', '-C', self.shell.getConfig('top-builddir'),
+ 'DESTDIR=%s' % self.pg_inst_dir, 'install',
+ 'with_perl=no', 'with_python=no'],
+ lineBasedOutput=True)
+
+ d = self.waitFor(proc, EventMatcher(ProcessEndedEvent))
+ d.addCallback(self.makeTerminated)
+ proc.start()
+
+ # FIXME: how to properly handle these?
+ self.shell.addEnvPath(self.shell.getConfig('bindir'))
+ self.shell.addEnvLibraryPath(self.shell.getConfig('libdir'))
+ return d
+
+ def makeTerminated(self, event):
+ if event.exitCode != 0:
+ raise Exception("Initdb returned %d" % event.exitCode)
+ else:
+ return True
+
+ def tearDown(self):
+ # The installation procedure should be able to simply override any
+ # formerly installed files, so we save the time to clean up the
+ # installation directory.
+ return
+
+
+ class InitdbSuite(TestSuite):
+
+ args = (('number', int), )
+ needs = (('shell', "IShell or something"),)
+
+ def setUpDescription(self):
+ return "initializing database system %d" % self.number
+
+ def tearDownDescription(self):
+ return "removing database system %d" % self.number
+
+ def getNumber(self):
+ return self.number
+
+ def getDir(self):
+ return self.dbdir
+
+ def setUp(self):
+ self.dbdir = "%s%d" % \
+ (self.shell.getConfig('pgdata_prefix'), self.number)
+ proc = self.shell.runCommand(
+ 'initdb-%d' % self.number,
+ 'initdb', args = [
+ 'initdb', '-D', self.dbdir,
+ '-A', 'trust', '--noclean'],
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ proc.addHook(EventMatcher(ProcessEndedEvent),
+ self.initdb_terminated, d)
+ proc.start()
+ return d
+
+ def initdb_terminated(self, event, d):
+ if event.exitCode != 0:
+ d.errback(Exception("Initdb returned %d" % event.exitCode))
+ else:
+ d.callback(True)
+
+ def tearDown(self):
+ self.shell.recursive_remove(
+ "%s%d" % (self.shell.getConfig('pgdata_prefix'), self.number))
+
+
+ class PostmasterSuite(TestSuite):
+
+ needs = (('shell', "IShell or something"),
+ ('dbdir', "IDatabaseDir"),)
+
+ def setUpDescription(self):
+ return "starting database system %d" % self.dbdir.getNumber()
+
+ def tearDownDescription(self):
+ return "stopping database system %d" % self.dbdir.getNumber()
+
+ def getPort(self):
+ return self.port
+
+ def setUp(self):
+ setattr(self, 'getNumber', self.dbdir.getNumber)
+
+ self.port = self.shell.getConfig('temp-port') + self.dbdir.getNumber()
+
+ args = ['postmaster', '-d5',
+ '-D', self.dbdir.getDir(),
+ '-i', '-p', str(self.port)]
+ if self.shell.getConfig('enable_cassert'):
+ args += "-A1"
+
+ self.postmaster = self.shell.runCommand(
+ 'postmaster%d' % self.dbdir.getNumber(),
+ 'postmaster',
+ args = args,
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ self.readyHook = \
+ self.postmaster.addHook(EventMatcher(ProcessErrorEvent,
+ "database system is ready to accept connections"),
+ self.postmaster_ready, d)
+
+ self.unexpectedTerminationHook = \
+ self.postmaster.addHook(EventMatcher(ProcessEndedEvent),
+ self.postmaster_terminated)
+ self.postmaster.start()
+ return d
+
+ def postmaster_ready(self, event, d):
+ # it's sufficient if we're called once
+ self.postmaster.removeHook(self.readyHook)
+ d.callback(None)
+
+ def postmaster_terminated(self, event):
+ exitCode = 'undef'
+ if hasattr(event, 'exitCode'):
+ exitCode = event.exitCode
+ elif hasattr(event, 'data'):
+ exitCode = repr(event.data)
+ self.abort("postmaster %d unexpectedly terminated (exit code %s)" % \
+ (self.dbdir.getNumber(), exitCode))
+
+ def tearDown(self):
+ self.postmaster.removeHook(self.unexpectedTerminationHook)
+ if not self.aborted:
+ d = defer.Deferred()
+ self.postmaster.addHook(EventMatcher(ProcessEndedEvent),
+ lambda event: d.callback(None))
+ self.postmaster.stop()
+ return d
+ else:
+ return True
+
+
+ class TestDatabaseSuite(TestSuite):
+
+ args = (('dbname', str),)
+ needs = (('shell', "IShell or something"),
+ ('pg', "IPostmaster"),)
+
+ def setUpDescription(self):
+ return "creating database %s at server %d" % \
+ (self.dbname, self.pg.getNumber())
+
+ def tearDownDescription(self):
+ return "not (!) dropping database %s at server %d" % \
+ (self.dbname, self.pg.getNumber())
+
+ def getDbname(self):
+ return self.dbname
+
+ def setUp(self):
+ setattr(self, "getPort", self.pg.getPort)
+ setattr(self, "getNumber", self.pg.getNumber)
+
+ self.proc = self.shell.runCommand(
+ 'createdb%d' % self.pg.getNumber(),
+ 'createdb',
+ args = ['createdb',
+ '-p', str(self.getPort()), self.dbname],
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ self.proc.addHook(EventMatcher(ProcessEndedEvent),
+ self.createdb_terminated, d)
+ self.proc.start()
+ return d
+
+ def createdb_terminated(self, event, d):
+ if event.exitCode != 0:
+ d.errback(Exception("createdb terminated with code %d" % \
+ event.exitCode))
+ else:
+ d.callback(None)
+
+ def tearDown(self):
+ if self.pg.aborted:
+ return True
+
+ # Hm.. this interferes with the postmaster suites, which need
+ # to be started and stopped several times on top of a test database,
+ # however, creating and dropping it certainly depends on a running
+ # postmaster. Not sure how to solve this, at the moment I'm just
+ # skipping cleanup, i.e. dropdb.
+ return True
+
+ self.proc = self.shell.runCommand(
+ 'dropdb%d' % self.pg.getNumber(),
+ 'dropdb',
+ args = ['dropdb',
+ '-p', str(self.getPort()), self.dbname],
+ lineBasedOutput=True)
+
+ d = defer.Deferred()
+ self.proc.addHook(EventMatcher(ProcessEndedEvent),
+ self.dropdb_terminated, d)
+ self.proc.start()
+ return d
+
+ def dropdb_terminated(self, event, d):
+ if event.exitCode != 0:
+ d.errback(Exception("dropdb returned with %d" % \
+ event.exitCode))
+ else:
+ d.callback(None)
+
+
+ class SqlConnectionSuite(TestSuite):
+
+ args = (('dbname', str),)
+ needs = (('shell', "IShell or something"),
+ ('db', "IPostmaster"))
+
+ def setUpDescription(self):
+ return "connecting to database %s at server %d" % \
+ (self.dbname, self.db.getNumber())
+ def tearDownDescription(self):
+ return "disconnecting from database %s at server %d" % \
+ (self.dbname, self.db.getNumber())
+
+ def getDbname(self):
+ return self.dbname
+
+ def setUp(self):
+ self.psql = self.shell.runCommand(
+ 'psql%d' % self.db.getNumber(),
+ 'psql',
+ args=['psql', '-AEn',
+ '--pset=pager=off', '--pset=columns=0',
+ '-p', str(self.db.getPort()),
+ self.dbname])
+
+ # initialize the output buffer and attach a first output collector
+ # *before* the process is started.
+ self.output_buffer = ""
+ d = defer.Deferred()
+ self.outputCollectorDeferred = d
+ self.outputCollectorHook = self.psql.addHook(
+ EventMatcher(ProcessOutputEvent), self.outputCollector,
+ None, d)
+
+ # Mark as being in used, until we get to the commandline
+ self.inUse = True
+ self.workQueue = []
+
+ # also add a termination hook
+ self.unexpectedTerminationHook = self.psql.addHook(
+ EventMatcher(ProcessEndedEvent), self.psql_terminated)
+
+ # then schedule start of the psql process and return the deferred
+ # *before* starting the process.
+ reactor.callLater(0.0, self.psql.start)
+ return d
+
+ def psql_terminated(self, event):
+ exitCode = "undef"
+ if hasattr(event, 'exitCode'):
+ exitCode = event.exitCode
+ elif hasattr(event, 'data'):
+ exitCode = repr(event.data)
+
+ # If there's an outputCollectorHook, the abort method won't catch
+ # and we have to wait for the timeout to trigger, instead of
+ # acting on process termination. We thus save the outputCollector
+ # deferred and send it an errback with the failure.
+ if self.outputCollectorHook:
+ self.outputCollectorDeferred.errback( \
+ TestAborted("psql to server %d unexpectedly terminated (exit code %s)" % ( \
+ self.db.getNumber(), exitCode)))
+ self.abort(
+ "psql to server %d unexpectedly terminated (exit code %s)" % ( \
+ self.db.getNumber(), exitCode))
+
+ def tearDown(self):
+ self.psql.removeHook(self.unexpectedTerminationHook)
+
+ d = defer.Deferred()
+ self.psql.addHook(EventMatcher(ProcessEndedEvent),
+ lambda event: d.callback(None))
+ reactor.callLater(0.0, self.psql.write, "\\q\n")
+ reactor.callLater(5.0, self.psql.stop)
+ return d
+
+ def outputCollector(self, event, query, d):
+ self.output_buffer += event.data
+
+ cmdprompt = self.dbname + '=#'
+ cpos = self.output_buffer.find(cmdprompt)
+
+ if cpos >= 0:
+ self.psql.removeHook(self.outputCollectorHook)
+ self.outputCollectorHook = False
+ result = self.output_buffer[:cpos]
+ self.output_buffer = self.output_buffer[cpos + len(cmdprompt):]
+ if len(self.output_buffer) > 0 and self.output_buffer != ' ':
+ print "rest: %s" % repr(self.output_buffer)
+ if d:
+ # remove the command prompt at the end
+ result = result[:cpos]
+
+ if query:
+ # remove the query string at the beginning
+ query_len = len(query)
+ if result[:query_len] != query:
+ raise Exception("Query not found at beginning of psql answer.")
+
+ result = result[query_len:]
+ while (len(result) > 1) and (result[0] in ("\n", "\r", " ")):
+ result = result[1:]
+ reactor.callLater(0.0, d.callback, result)
+
+ self.inUse = False
+ if len(self.workQueue) > 0:
+ assert not self.inUse
+ job = self.workQueue.pop()
+ d1 = job['method'](*job['args'])
+ d1.chainDeferred(job['deferred'])
+
+ def query(self, query):
+ if self.inUse:
+ d = defer.Deferred()
+ self.workQueue.append({'deferred': d,
+ 'method': self.query,
+ 'args': (query,)})
+ return d
+
+ assert not self.inUse
+ assert not self.outputCollectorHook
+
+ self.inUse = True
+ self.output_buffer = ""
+ d = defer.Deferred()
+ self.outputCollectorHook = self.psql.addHook(
+ EventMatcher(ProcessOutputEvent), self.outputCollector, query, d)
+ d.addCallback(self.parseQueryResult)
+
+ # defer writing to the process, so that the caller has the
+ # opportunity to add callbacks to the deferred we return.
+ reactor.callLater(0.0, self.psql.write, query + "\n")
+
+ return d
+
+ def parseQueryResult(self, result):
+ rawlines = result.split('\n')
+
+ lines = []
+ for line in rawlines:
+ line = line.strip()
+ if line.startswith("ROLLBACK"):
+ raise Exception("transaction rolled back (%s)" % query)
+ if line.startswith("message type"):
+ raise Exception("protocol error: %s" % line)
+ if len(line) > 0 and not line.startswith("NOTICE:") \
+ and not line.startswith("ROLLBACK"):
+ lines.append(line)
+
+ try:
+ assert len(lines) >= 2
+
+ lines = map(lambda x: x.strip(), lines)
+ headLine = lines[0]
+ tailLine = lines[-1]
+
+ fields = headLine.split('|')
+ rows = []
+ for row in lines[1:-1]:
+ attrs = row.split('|')
+ assert len(attrs) == len(fields)
+ x = {}
+ for i in range(len(attrs)):
+ x[fields[i]] = attrs[i].strip()
+ rows.append(x)
+
+ x = re.compile("\((\d+) rows?\)").search(tailLine)
+ if x:
+ if not int(x.group(1)) == len(rows):
+ raise Exception("number of rows doesn't match: %s vs %d for: '%s'" % (
+ x.group(1), len(rows), lines))
+ else:
+ raise Exception("final number of rows line doesn't match.\n------------\n%s\n---------------\n" % lines)
+ return rows
+ except Exception, e:
+ import traceback
+ print "error parsing query result: %s" % e
+ traceback.print_exc()
+ raise e
+ # return []
+
+ def operation(self, query, expResult=None):
+ if self.inUse:
+ d = defer.Deferred()
+ self.workQueue.append({'deferred': d,
+ 'method': self.operation,
+ 'args': (query, expResult)})
+ return d
+
+ assert not self.inUse
+ assert not self.outputCollectorHook
+
+ self.inUse = True
+ self.output_buffer = ""
+ d = defer.Deferred()
+ self.outputCollectorDeferred = d
+ self.outputCollectorHook = self.psql.addHook(
+ EventMatcher(ProcessOutputEvent), self.outputCollector, query, d)
+ d.addCallback(self.checkQueryResult, query, expResult)
+
+ # defer writing to the process, so that the caller has the
+ # opportunity to add callbacks to the deferred we return.
+ reactor.callLater(0.0, self.psql.write, query + "\n")
+
+ return d
+
+ def checkQueryResult(self, result, query, expResult):
+ lines = []
+ for line in result.split("\n"):
+ line = line.strip()
+ if len(line) > 0 and not line.startswith("WARNING:") \
+ and not line.startswith("NOTICE:"):
+ lines.append(line)
+ lines = "\n".join(lines)
+ if expResult:
+ if isinstance(expResult, str):
+ self.assertEqual(expResult, lines,
+ "didn't get expected result for query '%s'" % query)
+ elif isinstance(expResult, list):
+ if not lines in expResult:
+ raise TestFailure("didn't get expected result",
+ "no result matches, got:\n%s\nfor query: '%s'\n" % (lines, query))
+ return lines
+
+
+ class TestDatabaseConnection(BaseTest):
+
+ needs = (('conn', "ISqlConnection"),)
+
+ description = "database connection"
+
+ def run(self):
+ return self.conn.query("SELECT 1 AS test;")
+
+
+ # FIXME: that's not actually a test, but it modifies the database state
+ class PopulateTestDatabase(BaseTest):
+
+ needs = (('conn', "ISqlConnection"),)
+
+ description = "populate test database"
+
+ def run(self):
+ conn = self.conn
+
+ # Create a test table for use in TestConcurrentUpdates and fill it
+ # with two test tuples.
+ d = conn.operation("CREATE TABLE test (i int PRIMARY KEY, t text);",
+ "CREATE TABLE")
+ d.addCallback(lambda x: conn.operation(
+ "INSERT INTO test VALUES (5, 'apple');",
+ "INSERT 0 1"))
+ d.addCallback(lambda x: conn.operation(
+ "INSERT INTO test VALUES (7, 'pear');",
+ "INSERT 0 1"))
+ d.addCallback(lambda x: conn.operation(
+ "INSERT INTO test VALUES (11, 'banana');",
+ "INSERT 0 1"))
+ return d
+
+
+ class PermutationTest(SyncTest):
+ """ Abstract class for testing a set of steps in all permutations of execution order.
+ This counts as a single test, although a subclass may accumulate counts which may be of
+ interest, and should therefore be shown regardless of success or failure of the test.
+ """
+
+ # stepDictionary maps a step ID to a function to run for that step.
+ stepDictionary = {}
+
+ # stepThreading is a list of lists.
+ # All permutations of interleaving of steps from the sublists will be generated.
+ # Steps from within each sublist are kept in order; only the interleaving is variable.
+ stepThreading = [[]]
+
+ # Override this to provide any per-iteration (permutation) setup.
+ def setUpIteration(self, stepIdList):
+ pass
+
+ # Override this to provide any per-iteration (permutation) teardown.
+ def tearDownIteration(self, stepIdList):
+ pass
+
+ def runIterationStep(self, stepId):
+ p = self.stepDictionary[stepId]
+ p()
+
+ def runIterationSteps(self, stepIdList):
+ try:
+ self.setUpIteration(stepIdList)
+ for stepId in stepIdList:
+ self.runIterationStep(stepId)
+ finally:
+ self.tearDownIteration(stepIdList)
+
+ def runPermutations(self, a):
+ self.runPermutations_recurse([], a)
+
+ def runPermutations_recurse(self, p, a):
+ found = False
+ for i in range(len(a)):
+ if len(a[i]) > 0:
+ found = True
+ r = p[:]
+ b = a[:]
+ r.append(b[i][0])
+ b[i] = b[i][1:]
+ self.runPermutations_recurse(r, b)
+ if not found:
+ self.runIterationSteps(p)
+
+ # If the dictionary is set up in this method, there can be references
+ # to class methods and fields.
+ def populateStepDictionary(self):
+ pass
+
+ def run(self):
+ self.populateStepDictionary()
+ self.runPermutations(self.stepThreading)
+ # The last two lines of output for the last entry seem to disappear???
+ print
+ print
+
+
+ class DummyPermutationTest(PermutationTest):
+ """ Simple test of the PermutationTest abstract class.
+ """
+
+ description = "simple test of the PermutationTest abstract class"
+
+ stepThreading = [['r1x','c1'],['r2x','c2']]
+
+ def setUpIteration(self, stepIdList):
+ print stepIdList
+
+ def tearDownIteration(self, stepIdList):
+ print
+
+ def printStepId(self, stepId):
+ print stepId,
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'r1x': lambda : self.printStepId('r1x'),
+ 'c1': lambda : self.printStepId('c1'),
+ 'r2x': lambda : self.printStepId('r2x'),
+ 'c2': lambda : self.printStepId('c2')
+ }
+
+
+ class DatabasePermutationTest(PermutationTest):
+ """ Abstract class to provide framework for using an IterativeTest for database queries.
+ """
+
+ commitRequiredCount = 0
+ commitRequiredOK = 0
+ rollbackRequiredCount = 0
+ rollbackRequiredOK = 0
+ commitPreferredCount = 0
+ commitPreferredOK = 0
+
+ serializationFailure = False
+
+ def commitRequired(self, stepIdList):
+ return True
+
+ def rollbackRequired(self, stepIdList):
+ return False
+
+ def countProgress(self, stepIdList):
+ if self.rollbackRequired(stepIdList):
+ self.rollbackRequiredCount += 1
+ if self.serializationFailure:
+ self.rollbackRequiredOK += 1
+ else:
+ if self.commitRequired(stepIdList):
+ self.commitRequiredCount += 1
+ if not self.serializationFailure:
+ self.commitRequiredOK += 1
+ else:
+ self.commitPreferredCount += 1
+ if not self.serializationFailure:
+ self.commitPreferredOK += 1
+
+ def runIterationSteps(self, stepIdList):
+ try:
+ self.setUpIteration(stepIdList)
+ for stepId in stepIdList:
+ self.runIterationStep(stepId)
+ self.countProgress(stepIdList)
+ finally:
+ self.tearDownIteration(stepIdList)
+
+ def tryOperation(self, conn, sql):
+ result = self.syncCall(10, conn.operation, sql),
+ for line in result:
+ if len(line) > 0 and line.startswith("ERROR: could not serialize"):
+ self.serializationFailure = True
+ else:
+ if len(line) > 0 and line.startswith("ERROR:"):
+ raise TestFailure("failure other than serializable encountered: " + line, line)
+
+ def printStatistics(self):
+ print 'rollback required: ', self.rollbackRequiredOK, '/', self.rollbackRequiredCount
+ print 'commit required: ', self.commitRequiredOK, '/', self.commitRequiredCount
+ print 'commit preferred: ', self.commitPreferredOK, '/', self.commitPreferredCount
+
+ def run(self):
+ self.populateStepDictionary()
+ self.runPermutations(self.stepThreading)
+ self.printStatistics()
+ # The last two lines of output for the last entry seem to disappear???
+ print
+ print
+ if self.rollbackRequiredOK < self.rollbackRequiredCount:
+ raise TestFailure("serialization anomalies incorrectly allowed",
+ "Database integrity not protected.")
+ if self.commitRequiredOK < self.commitRequiredCount:
+ raise TestFailure("serialization failure occurred when it should not have",
+ "Transactions we thought we knew how to recognize as safe resulted in a rollback..")
+
+ def printStepResults(self, stepIdList):
+ if self.serializationFailure:
+ if self.commitRequired(stepIdList):
+ print 'rolled back ??'
+ else:
+ if not self.rollbackRequired(stepIdList):
+ print 'rolled back ?'
+ else:
+ print 'rolled back'
+ else:
+ if self.rollbackRequired(stepIdList):
+ print 'committed ***'
+ else:
+ print 'committed'
+
+
+ class SimpleWriteSkewTest(DatabasePermutationTest):
+ """ Write skew test.
+ This test has two serializable transactions: one which updates all
+ 'apple' rows to 'pear' and one which updates all 'pear' rows to
+ 'apple'. If these were serialized (run one at a time) either
+ value could be present, but not both. One must be rolled back to
+ prevent the write skew anomaly.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "write skew test"
+
+ stepThreading = [['rwx1','c1'],['rwx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rwx1': lambda : self.tryOperation(self.conn1, "UPDATE test SET t = 'apple' WHERE t = 'pear';"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'rwx2': lambda : self.tryOperation(self.conn2, "UPDATE test SET t = 'pear' WHERE t = 'apple';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ print stepIdList,
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'apple' WHERE i = 5;", "UPDATE 1")
+ self.syncCall(10, self.conn1.operation, "UPDATE test SET t = 'pear' WHERE i = 7;", "UPDATE 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return (stepIdList.index('c1') < stepIdList.index('rwx2')
+ or stepIdList.index('c2') < stepIdList.index('rwx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ReceiptReportTest(DatabasePermutationTest):
+ """ Daily Report of Receipts test.
+ This test doesn't persist a bad state in the database; rather, it
+ provides a view of the data which is not consistent with any
+ order of execution of the serializable transactions. It
+ demonstrates a situation where the deposit date for receipts could
+ be changed and a report of the closed day's receipts subsequently
+ run which will miss a receipt from the date which has been closed.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'),
+ ('conn3', 'ISqlConnection'))
+
+ description = "daily report of receipts test"
+
+ stepThreading = [['rxwy1','c1'],['wx2','c2'],['rx3','ry3','c3']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rxwy1': lambda : self.tryOperation(self.conn1, "INSERT INTO receipt VALUES (3, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 4.00);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE ctl SET deposit_date = DATE '2008-12-23' WHERE k = 'receipt';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;"),
+ 'rx3': lambda : self.tryOperation(self.conn3, "SELECT * FROM ctl WHERE k = 'receipt';"),
+ 'ry3': lambda : self.tryOperation(self.conn3, "SELECT * FROM receipt WHERE deposit_date = DATE '2008-12-22';"),
+ 'c3': lambda : self.tryOperation(self.conn3, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ print stepIdList,
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS ctl, receipt;")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE ctl (k text NOT NULL PRIMARY KEY, deposit_date date NOT NULL);")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO ctl VALUES ('receipt', DATE '2008-12-22');")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE receipt (receipt_no int NOT NULL PRIMARY KEY, deposit_date date NOT NULL, amount numeric(13,2));")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (1, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 1.00);")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO receipt VALUES (2, (SELECT deposit_date FROM ctl WHERE k = 'receipt'), 2.00);")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn3.operation, "BEGIN TRANSACTION READ ONLY ISOLATION LEVEL SERIALIZABLE READ ONLY;", "BEGIN")
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn3.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return ( (stepIdList.index('c1') < stepIdList.index('wx2')
+ and stepIdList.index('c1') < stepIdList.index('rx3'))
+ or (stepIdList.index('c2') < stepIdList.index('rxwy1')
+ and stepIdList.index('c2') < stepIdList.index('rx3'))
+ or (stepIdList.index('c3') < stepIdList.index('rxwy1')
+ and stepIdList.index('c3') < stepIdList.index('wx2'))
+ or (stepIdList.index('c2') < stepIdList.index('rxwy1')
+ and stepIdList.index('c3') < stepIdList.index('rxwy1'))
+ or (stepIdList.index('c1') < stepIdList.index('wx2')
+ and stepIdList.index('c3') < stepIdList.index('wx2'))
+ or (stepIdList.index('c1') < stepIdList.index('rx3')
+ and stepIdList.index('c2') < stepIdList.index('rx3')))
+
+ def rollbackRequired(self, stepIdList):
+ return ((stepIdList.index('c2') < stepIdList.index('c1')
+ and stepIdList.index('c2') < stepIdList.index('c3')
+ and stepIdList.index('rxwy1') < stepIdList.index('c2')
+ and stepIdList.index('rx3') < stepIdList.index('c1')
+ #############################################################
+ # The following test excludes some rows from rollback
+ # required for which we know our current SSI algorithm
+ # requires a rollback, but which don't, in fact, cause
+ # any anomaly. If we determine that we can allow pivots
+ # in which conflictIn and conflictOut are separate and
+ # overlapping transactions, these can be committed.
+ # To include these permutations in the "rollback required"
+ # count, comment out the following line.
+ and stepIdList.index('c2') < stepIdList.index('rx3')
+ #############################################################
+ )
+
+ #############################################################
+ # An anomaly can't actually occur based on the following
+ # "or" clause, but we know that our algorithm can't
+ # currently detect that, because T2's conflictIn is set
+ # to a self-reference because of multiple conflicts.
+ # To count these in the "rollback required" list, uncomment
+ # this section; otherwise they are "commit preferred"..
+ # or (stepIdList.index('rxwy1') < stepIdList.index('c1')
+ # and stepIdList.index('rxwy1') < stepIdList.index('c2')
+ # and stepIdList.index('rxwy1') < stepIdList.index('c3')
+ # and stepIdList.index('wx2') < stepIdList.index('c1')
+ # and stepIdList.index('wx2') < stepIdList.index('c2')
+ # and stepIdList.index('wx2') < stepIdList.index('c3')
+ # and stepIdList.index('rx3') < stepIdList.index('c1')
+ # and stepIdList.index('rx3') < stepIdList.index('c2')
+ # and stepIdList.index('rx3') < stepIdList.index('c3')
+ # )
+ #############################################################
+ )
+
+
+ class TemporalRangeIntegrityTest(DatabasePermutationTest):
+ """ Temporal range integrity test.
+ Snapshot integrity fails with simple referential integrity tests,
+ but those don't make for good demonstrations because people just
+ say that foreign key definitions should be used instead. There
+ are many integrity tests which are conceptually very similar but
+ don't have built-in support which will fail when used in triggers.
+ This is intended to illustrate such cases. It is obviously very
+ hard to exercise all these permutations when the code is actually
+ in a trigger; this test pulls what would normally be inside of
+ triggers out to the top level to control the permutations.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "temporal range integrity test"
+
+ stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date <= DATE '2009-05-15' AND (exp_date IS NULL OR exp_date > DATE '2009-05-15');"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO offense VALUES (1, '123.45(1)a', DATE '2009-05-15');"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM offense WHERE statute_cite = '123.45(1)a' AND offense_date >= DATE '2008-01-01';"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM statute WHERE statute_cite = '123.45(1)a' AND eff_date = DATE '2008-01-01';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS statute, offense;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE statute (statute_cite text NOT NULL, eff_date date NOT NULL, exp_date date, CONSTRAINT statute_pkey PRIMARY KEY (statute_cite, eff_date));", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO statute VALUES ('123.45(1)a', DATE '2008-01-01', NULL);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE offense (offense_no int NOT NULL, statute_cite text NOT NULL, offense_date date NOT NULL, CONSTRAINT offense_pkey PRIMARY KEY (offense_no));", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('ry2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ProjectManagerTest(DatabasePermutationTest):
+ """ Project manager test.
+ Ensure that the person who is on the project as a manager
+ is flagged as a project manager in the person table.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "project manager test"
+
+ stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM person WHERE person_id = 1 AND is_project_manager;"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO project VALUES (101, 'Build Great Wall', 1);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM project WHERE project_manager = 1;"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE person SET is_project_manager = false WHERE person_id = 1;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS person, project;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE person (person_id int NOT NULL PRIMARY KEY, name text NOT NULL, is_project_manager bool NOT NULL);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO person VALUES (1, 'Robert Haas', true);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE project (project_no int NOT NULL PRIMARY KEY, description text NOT NULL, project_manager int NOT NULL);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('ry2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class ClassroomSchedulingTest(DatabasePermutationTest):
+ """ Classroom scheduling test.
+ Ensure that the classroom is not scheduled more than once
+ for any moment in time.
+ """
+
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "classroom scheduling test"
+
+ stepThreading = [['rx1','wy1','c1'],['ry2','wx2','c2']]
+
+ def populateStepDictionary(self):
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:00' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:00';"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 13:00', TIMESTAMP WITH TIME ZONE '2010-04-01 14:00', 'Carol');"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT count(*) FROM room_reservation WHERE room_id = '101' AND start_time < TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' AND end_time > TIMESTAMP WITH TIME ZONE '2010-04-01 13:30';"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "UPDATE room_reservation SET start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 13:30', end_time = TIMESTAMP WITH TIME ZONE '2010-04-01 14:30' WHERE room_id = '101' AND start_time = TIMESTAMP WITH TIME ZONE '2010-04-01 10:00';"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS room_reservation;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE room_reservation (room_id text NOT NULL, start_time timestamp with time zone NOT NULL, end_time timestamp with time zone NOT NULL, description text NOT NULL, CONSTRAINT room_reservation_pkey PRIMARY KEY (room_id, start_time));", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO room_reservation VALUES ('101', TIMESTAMP WITH TIME ZONE '2010-04-01 10:00', TIMESTAMP WITH TIME ZONE '2010-04-01 11:00', 'Bob');", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ def tearDownIteration(self, stepIdList):
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ return ( stepIdList.index('c1') < stepIdList.index('ry2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ return not self.commitRequired(stepIdList)
+
+
+ class TotalCashTest(DatabasePermutationTest):
+ """ Total cash test.
+ Another famous test of a snapshot isolation anomaly: each of two
+ transactions withdraws 200 from a different account and then reads the
+ combined balance; under plain snapshot isolation both can commit even
+ though no serial ordering produces both observed sums.
+ """
+
+ # Two ordinary SQL connections supplied by the enclosing suite.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "total cash test"
+
+ # Two concurrent step threads; the framework runs every interleaving.
+ # Naming: w=write, r=read, c=commit; digit = owning connection.
+ stepThreading = [['wx1','rxy1','c1'],['wy2','rxy2','c2']]
+
+ def populateStepDictionary(self):
+ # Map step ids to the SQL each step issues on its connection.
+ self.stepDictionary = {
+ 'wx1': lambda : self.tryOperation(self.conn1, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'checking';"),
+ 'rxy1': lambda : self.tryOperation(self.conn1, "SELECT SUM(balance) FROM accounts;"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'wy2': lambda : self.tryOperation(self.conn2, "UPDATE accounts SET balance = balance - 200 WHERE accountid = 'savings';"),
+ 'rxy2': lambda : self.tryOperation(self.conn2, "SELECT SUM(balance) FROM accounts;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ # Rebuild the fixture table and open two SERIALIZABLE transactions
+ # before every permutation; each syncCall asserts the expected tag.
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS accounts;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE accounts (accountid text NOT NULL PRIMARY KEY, balance numeric not null);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO accounts VALUES ('checking', 600),('savings',600);", "INSERT 0 2")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ def tearDownIteration(self, stepIdList):
+ # ROLLBACK is harmless if the transaction already committed/aborted.
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ # Presumably: if either transaction committed before the other made
+ # its conflicting write, the dangerous structure cannot arise and
+ # both transactions must be allowed to commit -- TODO confirm.
+ return ( stepIdList.index('c1') < stepIdList.index('wy2')
+ or stepIdList.index('c2') < stepIdList.index('wx1'))
+
+ def rollbackRequired(self, stepIdList):
+ # Complement of commitRequired for this permutation.
+ return not self.commitRequired(stepIdList)
+
+
+ class ReferentialIntegrityTest(DatabasePermutationTest):
+ """ Referential integrity test.
+ The assumption here is that the application code issuing the SELECT
+ to test for the presence or absence of a related record would do the
+ right thing -- this script doesn't include that logic.
+ """
+
+ # Two ordinary SQL connections supplied by the enclosing suite.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "referential integrity test"
+
+ # Thread 1 checks the parent then inserts a child; thread 2 checks
+ # parent and children then deletes the parent. Every interleaving runs.
+ stepThreading = [['rx1','wy1','c1'],['rx2','ry2','wx2','c2']]
+
+ def populateStepDictionary(self):
+ # Map step ids to the SQL each step issues on its connection.
+ self.stepDictionary = {
+ 'rx1': lambda : self.tryOperation(self.conn1, "SELECT i FROM a WHERE i = 1;"),
+ 'wy1': lambda : self.tryOperation(self.conn1, "INSERT INTO b VALUES (1);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'rx2': lambda : self.tryOperation(self.conn2, "SELECT i FROM a WHERE i = 1;"),
+ 'ry2': lambda : self.tryOperation(self.conn2, "SELECT a_id FROM b WHERE a_id = 1;"),
+ 'wx2': lambda : self.tryOperation(self.conn2, "DELETE FROM a WHERE i = 1;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ # Rebuild both tables and open two SERIALIZABLE transactions
+ # before every permutation.
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS a, b;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE a (i int PRIMARY KEY);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE b (a_id int);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO a VALUES (1);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ def tearDownIteration(self, stepIdList):
+ # ROLLBACK is harmless if the transaction already committed/aborted.
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ # Presumably: if either transaction committed before the other
+ # performed its first read, there is no overlap that could form a
+ # dangerous structure, so both must commit -- TODO confirm.
+ return ( stepIdList.index('c1') < stepIdList.index('rx2')
+ or stepIdList.index('c2') < stepIdList.index('rx1'))
+
+ def rollbackRequired(self, stepIdList):
+ # Complement of commitRequired for this permutation.
+ return not self.commitRequired(stepIdList)
+
+
+ class RITriggerTest(DatabasePermutationTest):
+ """ Referential integrity trigger test.
+ Enforces parent/child referential integrity with hand-rolled PL/pgSQL
+ AFTER triggers (SQLSTATE 23503) instead of declared foreign keys, then
+ races a child INSERT against a parent DELETE under SERIALIZABLE.
+ """
+
+ # Two ordinary SQL connections supplied by the enclosing suite.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ description = "referential integrity trigger test"
+
+ # Step ids encode the implicit trigger reads: the child INSERT also
+ # reads the parent (wxry1); the parent DELETE also reads children (wyrx2).
+ stepThreading = [['wxry1','c1'],['r2','wyrx2','c2']]
+
+ def populateStepDictionary(self):
+ # Map step ids to the SQL each step issues on its connection.
+ self.stepDictionary = {
+ 'wxry1': lambda : self.tryOperation(self.conn1, "INSERT INTO child (parent_id) VALUES (0);"),
+ 'c1': lambda : self.tryOperation(self.conn1, "COMMIT;"),
+ 'r2': lambda : self.tryOperation(self.conn2, "SELECT TRUE;"),
+ 'wyrx2': lambda : self.tryOperation(self.conn2, "DELETE FROM parent WHERE parent_id = 0;"),
+ 'c2': lambda : self.tryOperation(self.conn2, "COMMIT;")
+ }
+
+ def setUpIteration(self, stepIdList):
+ # Rebuild tables, trigger functions and triggers, seed one parent row,
+ # and open two SERIALIZABLE transactions before every permutation.
+ self.serializationFailure = False
+ self.syncCall(10, self.conn1.operation, "DROP TABLE IF EXISTS parent, child;", "DROP TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE parent (parent_id SERIAL NOT NULL PRIMARY KEY);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE TABLE child (child_id SERIAL NOT NULL PRIMARY KEY, parent_id INTEGER NOT NULL);", "CREATE TABLE")
+ self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_parent() RETURNS TRIGGER AS $body$\
+ BEGIN\
+ PERFORM TRUE FROM child WHERE parent_id = OLD.parent_id;\
+ IF FOUND THEN\
+ RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || OLD.parent_id || ' still referenced during ' || TG_OP;\
+ END IF;\
+ RETURN NULL;\
+ END;\
+ $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION")
+ self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_parent AFTER UPDATE OR DELETE ON parent FOR EACH ROW EXECUTE PROCEDURE ri_parent();", "CREATE TRIGGER")
+ self.syncCall(10, self.conn1.operation, "CREATE OR REPLACE FUNCTION ri_child() RETURNS TRIGGER AS $body$\
+ BEGIN\
+ PERFORM TRUE FROM parent WHERE parent_id = NEW.parent_id;\
+ IF NOT FOUND THEN\
+ RAISE SQLSTATE '23503' USING MESSAGE = 'Parent ' || NEW.parent_id || ' does not exist during ' || TG_OP;\
+ END IF;\
+ RETURN NULL;\
+ END;\
+ $body$ LANGUAGE PLPGSQL VOLATILE;", "CREATE FUNCTION")
+ self.syncCall(10, self.conn1.operation, "CREATE TRIGGER ri_child AFTER INSERT OR UPDATE ON child FOR EACH ROW EXECUTE PROCEDURE ri_child();", "CREATE TRIGGER")
+ self.syncCall(10, self.conn1.operation, "INSERT INTO parent VALUES(0);", "INSERT 0 1")
+ self.syncCall(10, self.conn1.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ self.syncCall(10, self.conn2.operation, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "BEGIN")
+ print stepIdList,
+
+ # Override the normal method to allow failures generated by the trigger code
+ # to be considered "success". Just so we can count things up.
+ # NOTE(review): the trailing comma below makes `result` a one-element
+ # tuple wrapping the syncCall return value, so the loop runs exactly
+ # once with `line` bound to the whole result -- verify this is intended
+ # rather than iterating over individual output lines.
+ def tryOperation(self, conn, sql):
+ result = self.syncCall(10, conn.operation, sql),
+ for line in result:
+ if len(line) > 0 and line.startswith("ERROR: could not serialize"):
+ self.serializationFailure = True
+ else:
+ # Trigger-raised RI errors ("ERROR: Parent 0 ...") are expected
+ # and tolerated; anything else is a genuine failure.
+ # NOTE(review): the second `len(line) > 0` test is redundant.
+ if (len(line) > 0 and line.startswith("ERROR:")
+ and len(line) > 0 and not line.startswith("ERROR: Parent 0 ")):
+ raise TestFailure("failure other than serializable encountered: " + line, line)
+
+ def tearDownIteration(self, stepIdList):
+ # ROLLBACK is harmless if the transaction already committed/aborted.
+ self.syncCall(10, self.conn1.operation, "ROLLBACK;")
+ self.syncCall(10, self.conn2.operation, "ROLLBACK;")
+ self.printStepResults(stepIdList)
+
+ def commitRequired(self, stepIdList):
+ # Presumably: if either transaction committed before the other did
+ # any work, the two cannot overlap dangerously -- TODO confirm.
+ return ( stepIdList.index('c1') < stepIdList.index('r2')
+ or stepIdList.index('c2') < stepIdList.index('wxry1'))
+
+ def rollbackRequired(self, stepIdList):
+ # Complement of commitRequired for this permutation.
+ return not self.commitRequired(stepIdList)
+
+
+ class TestTrueSerializabilityConcurrentUpdates(SyncTest):
+ """ Runs three transactions concurrently, each reading from what the
+ other writes in turn. Should raise a serialization failure, but
+ instead leads to wrong results, ATM.
+ """
+
+ description = "concurrent updates"
+
+ # Three SQL connections, one per concurrent transaction.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'),
+ ('conn3', 'ISqlConnection'))
+
+ def execOnAllConnections(self, sql, expRes=None):
+ # Issue the same statement on every connection concurrently and
+ # return a DeferredList that fires when all complete (or one errs).
+ deferreds = []
+ for conn in self.connections:
+ d = conn.operation(sql, expRes)
+ deferreds.append(d)
+
+ d = defer.DeferredList(deferreds,
+ consumeErrors=True, fireOnOneErrback=True)
+ return d
+
+ def readValueThenWrite(self, conn, readFromId, writeToId):
+ # Read row readFromId, then (via callback) copy its value into
+ # row writeToId on the same connection.
+ d = conn.query("SELECT t FROM test WHERE i = %d;" % readFromId)
+ d.addCallback(self.writeValueBack, conn, writeToId)
+ return d
+
+ def writeValueBack(self, result, conn, writeToId):
+ # Callback for readValueThenWrite: expects exactly one single-column
+ # row and writes that value to the target row.
+ self.assertEqual(1, len(result),
+ "expected exactly one result row")
+ row = result[0]
+ self.assertEqual(1, len(row),
+ "expected exactly one column")
+ value = row['t']
+ d = conn.operation("UPDATE test SET t = '%s' WHERE i = %d;" % (value, writeToId),
+ "UPDATE")
+ return d
+
+ def startConcurrentOperations(self):
+ # Ring of rw-dependencies: 5 -> 7 -> 11 -> 5, one hop per connection.
+ d1 = self.readValueThenWrite(self.conn1, readFromId=5, writeToId=7)
+ d2 = self.readValueThenWrite(self.conn2, readFromId=7, writeToId=11)
+ d3 = self.readValueThenWrite(self.conn3, readFromId=11, writeToId=5)
+ return defer.DeferredList([d1, d2, d3],
+ consumeErrors=True, fireOnOneErrback=True)
+
+ def run(self):
+ # Ensure all connections are rolled back even if the test body fails.
+ try:
+ self.sub_run()
+ finally:
+ self.syncCall(10, self.execOnAllConnections, "ROLLBACK;")
+
+ def sub_run(self):
+ self.connections = [
+ self.conn1,
+ self.conn2,
+ self.conn3]
+
+ # begin a transaction on all three connections
+ self.syncCall(10, self.execOnAllConnections,
+ "BEGIN;", "BEGIN")
+
+ # set their isolation level to SERIALIZABLE
+ self.syncCall(10, self.execOnAllConnections,
+ "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET")
+
+ # concurrently let each of the three transactions read a value and
+ # write that to another tuple, wait for all the UPDATEs to complete
+ # before trying to commit any of the transactions
+ self.syncCall(10, self.startConcurrentOperations)
+
+ # try to commit all three transactions (accepting both COMMIT or
+ # ERROR, we check the result later on).
+ self.syncCall(10, self.execOnAllConnections,
+ "COMMIT;", "COMMIT|ERROR");
+
+ # count the occurrence of each fruit
+ result = self.syncCall(10, self.conn1.query,
+ "SELECT t FROM test WHERE i IN (5, 7, 11);")
+ counters = {'banana': 0, 'apple': 0, 'pear': 0}
+ for row in result:
+ counters[row['t']] += 1
+
+ # you currently get one fruit each, as no transaction gets aborted,
+ # which is impossible if the transactions had been executed one
+ # after another. (If any transaction had aborted, some fruit would
+ # appear twice and another zero times.)
+ if counters.values() == [1, 1, 1]:
+ raise TestFailure("conflict not detected",
+ "All transactions committed, so the conflict hasn't been detected.")
+
+ class TestTrueSerializabilityConcurrentInsert(BaseTest):
+ """ Runs two transactions, both doing an insert, first, then select
+ all the relevant rows (within the range 100 <= i < 110). We let the
+ first transaction commit before creating the cyclic dependency,
+ which forces transaction 2 to abort.
+ """
+
+ description = "concurrent insert"
+
+ # Two SQL connections, one per concurrent transaction.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ def execOnAllConnections(self, sql, expRes=None):
+ # Issue the same statement on every connection concurrently and
+ # return a DeferredList that fires when all complete (or one errs).
+ deferreds = []
+ for conn in self.connections:
+ d = conn.operation(sql, expRes)
+ deferreds.append(d)
+
+ d = defer.DeferredList(deferreds,
+ consumeErrors=True, fireOnOneErrback=True)
+ return d
+
+ def run(self):
+ self.connections = [
+ self.conn1,
+ self.conn2]
+
+ # begin a transaction on both connections
+ d = self.execOnAllConnections("BEGIN;", "BEGIN")
+
+ # set their isolation level to SERIALIZABLE
+ d.addCallback(lambda x:
+ self.execOnAllConnections(
+ "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET"))
+
+ # let transaction 1 do an insert (so it acquires a snapshot)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "INSERT INTO test (i, t) VALUES (101, 'orange');", "INSERT 0 1"))
+
+ # then same for transaction 2
+ d.addCallback(lambda x:
+ self.conn2.operation(
+ "INSERT INTO test (i, t) VALUES (102, 'grapefruit');", "INSERT 0 1"))
+
+ # let transaction 1 read the relevant rows, so it acquires an SIREAD
+ # lock on the predicate. (The result is discarded).
+ # NOTE(review): the comment above says transaction 1, but the code
+ # below queries via conn2 -- confirm which connection was intended.
+ d.addCallback(lambda x:
+ self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;"))
+
+ # then commit transaction 1 (which should still succeed)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "COMMIT;", "COMMIT"))
+
+ # try to read all rows with the second transaction's snapshot (which
+ # doesn't see the update of transaction 1)
+ d.addCallback(lambda x:
+ self.conn2.query("SELECT t FROM test WHERE i >= 100 AND i < 110;"))
+
+ # With SSI in place, this should lock the same predicate with an
+ # SIREAD lock, which should bomb out on the orange (tuple i = 101)
+ # from transaction 1.
+ #
+ # dtester FIXME: Hm.. this could need some "expect to fail" help
+ # from dtester
+ d.addCallback(self.checkResult)
+
+ # cleanup both transactions, especially in case of failure
+ d.addBoth(self.cleanup)
+
+ return d
+
+ def checkResult(self, result):
+ # The test passes only if the preceding SELECT errored out; a plain
+ # result means the serialization conflict went undetected.
+ if not isinstance(result, failure.Failure):
+ raise TestFailure("conflict not detected",
+ "SELECT should raise a serialization error")
+ return result
+
+ def cleanup(self, result):
+ d = self.execOnAllConnections("ROLLBACK;")
+
+ # ignore errors above, but instead make sure we return the result
+ # we got here, especially if it was an error.
+ d.addBoth(lambda x: result)
+ return d
+
+ class TestTrueSerializabilityConcurrentInsert2(BaseTest):
+ """ Pretty similar to the above test, except that the first transaction
+ doesn't read (and thus predicate lock) the relevant rows. This still
+ leaves a possible serialization ordering, even if it doesn't match
+ the real commit ordering.
+
+ Uses rows 200 <= i < 210
+ """
+
+ # NOTE(review): same description string as the previous test class;
+ # consider "concurrent insert 2" to disambiguate reports.
+ description = "concurrent insert"
+
+ # Two SQL connections, one per concurrent transaction.
+ needs = (('conn1', 'ISqlConnection'),
+ ('conn2', 'ISqlConnection'))
+
+ def execOnAllConnections(self, sql, expRes=None):
+ # Issue the same statement on every connection concurrently and
+ # return a DeferredList that fires when all complete (or one errs).
+ deferreds = []
+ for conn in self.connections:
+ d = conn.operation(sql, expRes)
+ deferreds.append(d)
+
+ d = defer.DeferredList(deferreds,
+ consumeErrors=True, fireOnOneErrback=True)
+ return d
+
+ def run(self):
+ self.connections = [
+ self.conn1,
+ self.conn2]
+
+ # begin a transaction on both connections
+ d = self.execOnAllConnections("BEGIN;", "BEGIN")
+
+ # set their isolation level to SERIALIZABLE
+ d.addCallback(lambda x:
+ self.execOnAllConnections(
+ "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;", "SET"))
+
+ # let transaction 1 do an insert (so it acquires a snapshot)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "INSERT INTO test (i, t) VALUES (201, 'orange');", "INSERT 0 1"))
+
+ # then same for transaction 2
+ d.addCallback(lambda x:
+ self.conn2.operation(
+ "INSERT INTO test (i, t) VALUES (202, 'grapefruit');", "INSERT 0 1"))
+
+ # no SELECT here, so transaction 1 doesn't acquire any SIREAD lock
+
+ # then commit transaction 1 (which should succeed)
+ d.addCallback(lambda x:
+ self.conn1.operation(
+ "COMMIT;", "COMMIT"))
+
+ # try to read all rows with the second transaction's snapshot (which
+ # doesn't see the update of transaction 1)
+ d.addCallback(lambda x:
+ self.conn2.query("SELECT t FROM test WHERE i >= 200 AND i < 210;"))
+
+ # With SSI in place, this should lock the same predicate as above
+ # with an SIREAD lock. This includes the row just written by the
+ # first transaction.
+ #
+ # As long as there are no other edges, this still leaves a possible
+ # serialization ordering: if we executed the second transaction
+ # *before* the first one, the second didn't see the 'orange' row
+ # inserted "later" by the first transaction. That's the result we
+ # expect.
+ d.addCallback(self.checkResult)
+
+ # commit transaction 2
+ d.addCallback(lambda x:
+ self.conn2.operation(
+ "COMMIT;", "COMMIT"))
+
+ # add a cleanup handler
+ d.addErrback(self.cleanup)
+
+ return d
+
+ def checkResult(self, result):
+ # Expect exactly transaction 2's own row; transaction 1's insert is
+ # invisible to this snapshot, and no error should have been raised.
+ self.assertEqual(len(result), 1,
+ "Expected exactly one row, got %d (%s)" % (
+ len(result), repr(result)))
+ self.assertEqual(result[0], {"t": "grapefruit"},
+ "Expected to read the grapefruit row, but got %s" % (result[0],))
+
+ return result
+
+ def cleanup(self, result):
+ d = self.execOnAllConnections("ROLLBACK;")
+
+ # ignore errors above, but instead make sure we return the result
+ # we got here, especially if it was an error.
+ d.addBoth(lambda x: result)
+ return d
+
+
+ # ****** test running code ************************************************
+
+ class Logger(object):
+ """ A simplistic logger that just writes it all into one single file.
+ Events are stringified, newline-terminated, and flushed immediately.
+ """
+ def __init__(self, logFileName):
+ # Truncates any existing log file.
+ self.logfile = open(logFileName, 'w')
+
+ def __del__(self):
+ # NOTE(review): closing in __del__ relies on the object being
+ # garbage-collected; an explicit close() would be more deterministic.
+ self.logfile.close()
+
+ def callback(self, event):
+ # Hooked into dtester's event stream; flush so the log survives crashes.
+ self.logfile.write(str(event) + "\n")
+ self.logfile.flush()
+
+ def main(argv):
+ # Entry point: parse command-line options, build the configuration,
+ # define the test/suite dependency graph, and hand it to the dtester
+ # Runner. (Python 2 code: print statement, has_key below.)
+ print "Postgres dtester suite Copyright (c) 2004-2010, by Markus Wanner\n"
+
+ # Substituted by the build system from configure's arguments.
+ postgres_configure_args = "@configure_args@"
+
+ config = {
+ 'temp-port': 65432,
+
+ # by default, use the same installation directory as make check
+ 'inst_dir': os.path.join(os.getcwd(), 'tmp_check/install'),
+
+ # and a similar prefix
+ 'pgdata_prefix': os.path.join(os.getcwd(), 'tmp_check/data-dtester'),
+ 'logfile' : os.path.join(os.getcwd(), 'dtester.log'),
+
+ 'enable_cassert': 'enable_cassert' in postgres_configure_args
+ }
+
+ # usage() is presumably defined elsewhere in this file -- not visible here.
+ try:
+ opts, args = getopt.getopt(argv,
+ "h",
+ ["help", "temp-install", "top-builddir=", "temp-port=",
+ "multibyte="])
+ except getopt.GetoptError:
+ usage()
+ sys.exit(2)
+
+ for opt, arg in opts:
+ if opt in ("-h", "--help"):
+ usage()
+ sys.exit()
+ elif opt in ("--temp-install"):
+ config["temp-install"] = True
+ elif opt in ("--temp-port"):
+ # Accept only valid non-privileged TCP ports.
+ try:
+ arg = int(arg)
+ if arg >= 1024 and arg <= 65535:
+ config["temp-port"] = arg
+ else:
+ print "temp-port out of range."
+ sys.exit(2)
+ except ValueError:
+ print "Fatal: invalid temp-port specified"
+ sys.exit(2)
+ elif opt in ("--top-builddir"):
+ config["top-builddir"] = arg
+
+
+ # Derive bin/lib/data directories from the @...@ configure substitutions,
+ # stripping the leading '/' so os.path.join keeps them under inst_dir.
+ if not config.has_key('bindir'):
+ bindir = '@bindir@'
+ if bindir[0] == '/':
+ bindir = bindir[1:]
+ config['bindir'] = os.path.join(config['inst_dir'], bindir)
+ if not config.has_key('libdir'):
+ libdir = '@libdir@'
+ if libdir[0] == '/':
+ libdir = libdir[1:]
+ config['libdir'] = os.path.join(config['inst_dir'], libdir)
+ if not config.has_key('datadir'):
+ datadir = '@datadir@'
+ if datadir[0] == '/':
+ datadir = datadir[1:]
+ config['datadir'] = os.path.join(config['inst_dir'], datadir)
+
+
+ # FIXME: should not have to be here
+ logger = Logger(config['logfile'])
+ config['main_logging_hook'] = (EventMatcher(Event), logger.callback)
+
+
+ # definition of tests and suites, including their dependencies
+ tdef = {
+ # runs 'make install' to make sure the installation is up to date
+ 'temp_install': {'class': InstallationSuite,
+ 'uses': ('__system__',)},
+
+ # runs initdb, providing the Postgres data directory
+ 'initdb-0': {'class': InitdbSuite,
+ 'uses': ('temp_install',),
+ 'args': (0,)},
+
+ # runs a postmaster on the created database directory
+ 'pg-0': {'class': PostmasterSuite,
+ 'uses': ('temp_install', 'initdb-0')},
+
+ # creates a test database on pg-0
+ 'testdb': {'class': TestDatabaseSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',)},
+
+ # open two connections
+ 'conn-0A': {'class': SqlConnectionSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',),
+ 'depends': ('testdb',)},
+ 'conn-0B': {'class': SqlConnectionSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',),
+ 'depends': ('testdb',)},
+ 'conn-0C': {'class': SqlConnectionSuite,
+ 'uses': ('temp_install', 'pg-0'),
+ 'args': ('testdb',),
+ 'depends': ('testdb',)},
+
+ # test the connections
+ 'test-conn-0A': {'class': TestDatabaseConnection,
+ 'uses': ('conn-0A',)},
+ 'test-conn-0B': {'class': TestDatabaseConnection,
+ 'uses': ('conn-0B',)},
+ 'test-conn-0C': {'class': TestDatabaseConnection,
+ 'uses': ('conn-0C',)},
+
+ # 'dummy-recursion': {'class': DummyPermutationTest},
+
+ # populate the test database
+ 'populate-testdb': {'class': PopulateTestDatabase,
+ 'uses': ('conn-0A',),
+ 'onlyAfter': ('test-conn-0A', 'test-conn-0B',
+ 'test-conn-0C')},
+
+ # The permutation tests run strictly one after another ('onlyAfter'
+ # chain); all are expected to fail ('xfail') until SSI is complete.
+ 'simple-write-skew': {'class': SimpleWriteSkewTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('populate-testdb',),
+ 'xfail': True},
+
+ 'receipt-report': {'class': ReceiptReportTest,
+ 'uses': ('conn-0A', 'conn-0B', 'conn-0C'),
+ 'onlyAfter': ('simple-write-skew',),
+ 'xfail': True},
+
+ 'temporal-range': {'class': TemporalRangeIntegrityTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('receipt-report',),
+ 'xfail': True},
+
+ 'project-manager': {'class': ProjectManagerTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('temporal-range',),
+ 'xfail': True},
+
+ 'classroom-scheduling': {'class': ClassroomSchedulingTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('project-manager',),
+ 'xfail': True},
+
+ 'total-cash': {'class': TotalCashTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('classroom-scheduling',),
+ 'xfail': True},
+
+ 'referential-integrity': {'class': ReferentialIntegrityTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('total-cash',),
+ 'xfail': True},
+
+ 'ri-trigger': {'class': RITriggerTest,
+ 'uses': ('conn-0A', 'conn-0B'),
+ 'onlyAfter': ('referential-integrity',),
+ 'xfail': True}
+
+ # The deferred-style serializability tests are currently disabled.
+ # 'ser-updates': {'class': TestTrueSerializabilityConcurrentUpdates,
+ # 'uses': ('conn-0A', 'conn-0B', 'conn-0C'),
+ # 'onlyAfter': ('populate-testdb',),
+ # 'xfail': True},
+ #
+ # 'ser-insert': {'class': TestTrueSerializabilityConcurrentInsert,
+ # 'uses': ('conn-0A', 'conn-0B'),
+ # 'onlyAfter': ('ser-updates',),
+ # 'xfail': True},
+ #
+ # 'ser-insert2': {'class': TestTrueSerializabilityConcurrentInsert2,
+ # 'uses': ('conn-0A', 'conn-0B'),
+ # 'onlyAfter': ('ser-insert',)}
+ }
+
+
+ # Generous timeouts: permutation tests iterate many interleavings.
+ runner = Runner(testTimeout=600, suiteTimeout=3600)
+ runner.run(tdef, config)
+
+
+ # Standard script entry point: forward command-line args (minus argv[0]).
+ if __name__ == "__main__":
+ main(sys.argv[1:])
+