[GSOC] Eliminate O(N^2) scaling from rw-conflict tracking in serializable transactions

Started by Mengxing Liuover 8 years ago3 messages
#1Mengxing Liu
liu-mx15@mails.tsinghua.edu.cn
1 attachment(s)

Hi, all. There was a very strange phenomenon I couldn't explain. So I was wondering if you can help me.

I was trying to replace the linked list with a skip list in serializable transaction object for faster conflict tracking. But the performance is bad.
So I used the instruction "rdtsc" to compare the speed of my skip list and the original linked list. The skip list was about 1.5x faster.

The interesting thing is that if I added the instruction "rdstc" at the end of the function "RWConflictExists",
the performance of the whole system was increased by at most 3 times!
Here is the result.

| benchmarks | without rdtsc | with rdtsc |
| simpe read/write | 4.91 | 14.16 |
| ssibench | 9.72 | 10.24 |
| tpcb | 26.45 | 26.38 |

( The simple read/write benchmark has the most number of conflicts. )

The patch is attached. All the difference of the two columns is with/without a simple line of code:
__asm__ __volatile__ ("rdtsc");
But I don't know why this instruction will influence the performance so much!

BTW, after adding the "rdtsc" instruction, the performance is better than the original version about 10% at most.
That means, the skip list can work!

Looking forward to your advices.

--
Sincerely

Mengxing Liu

Attachments:

skip-list-for-conflict-tracking.patchapplication/octet-stream; name=skip-list-for-conflict-tracking.patchDownload
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 27c4af9..e25166b 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -628,6 +628,104 @@ NextPredXact(SERIALIZABLEXACT *sxact)
 }
 
 /*------------------------------------------------------------------------*/
+/*
+ * These functions manage access to the skip list
+ */
+
+#define GET_SKIPLIST_VALUE(elem, offset) \
+			(*(unsigned long*)(((char*)(elem)) + (offset)))
+#define SKIPLIST_VALUE_OFFSET(type, skiplist_field, value_field) \
+			(offsetof(type, value_field) - offsetof(type, skiplist_field))
+
+
+static inline bool
+SkipListFindElem(SHM_SKIPLIST* skiplist, Size valueOffset, unsigned long value,
+	SHM_SKIPLIST** topLinkPreElem, SHM_SKIPLIST** belowLinkPreElem)
+{
+	bool result;
+	unsigned long curValue;
+	SHM_SKIPLIST* curElem, *nextElem;
+
+	curElem = skiplist;
+	result = false;
+	nextElem = (SHM_SKIPLIST*) SHMQueueNext(&(skiplist->topLink), 
+										&(curElem->topLink), 
+										offsetof(SHM_SKIPLIST, topLink));
+	while (nextElem)
+	{
+		curValue = GET_SKIPLIST_VALUE(nextElem, valueOffset);
+		if (curValue == value)
+		{
+			if (topLinkPreElem)
+				*topLinkPreElem = nextElem;
+			if (belowLinkPreElem)
+				*belowLinkPreElem = nextElem;
+			return true;
+		}
+		if (curValue > value)
+			break;
+		curElem = nextElem;
+		nextElem = (SHM_SKIPLIST*)SHMQueueNext(&skiplist->topLink, 
+										&nextElem->topLink, 
+										offsetof(SHM_SKIPLIST, topLink));
+	}
+	if (topLinkPreElem)
+		*topLinkPreElem = curElem;
+
+	nextElem = (SHM_SKIPLIST*)SHMQueueNext(&(skiplist->belowLink),
+										&(curElem->belowLink),
+										offsetof(SHM_SKIPLIST, belowLink));
+	while (nextElem)
+	{
+		curValue = GET_SKIPLIST_VALUE(nextElem, valueOffset);
+		if (curValue == value)
+		{
+			result = true;
+			break;
+		}
+		if (curValue > value)
+			break;
+		curElem = nextElem;
+		nextElem = (SHM_SKIPLIST*) SHMQueueNext(&skiplist->belowLink,
+											&nextElem->belowLink,
+											offsetof(SHM_SKIPLIST, belowLink));
+	}
+
+	if (belowLinkPreElem)
+		*belowLinkPreElem = curElem;
+	return result;
+}
+
+static inline void 
+SkipListInsert(SHM_SKIPLIST* skiplist, Size valueOffset, SHM_SKIPLIST* elem)
+{
+	unsigned long elemValue;
+	SHM_SKIPLIST* topLinkPreElem, *belowLinkPreElem;
+	bool find;
+	topLinkPreElem = NULL;
+	belowLinkPreElem = NULL;
+
+	elemValue = GET_SKIPLIST_VALUE(elem, valueOffset);
+	find = SkipListFindElem(skiplist, valueOffset, elemValue, &topLinkPreElem, &belowLinkPreElem);
+	Assert(find == false);
+	Assert(topLinkPreElem);
+	Assert(belowLinkPreElem);
+
+    /* Because most of lists have less than 10 elements; so we make the top link skips average 3 elements*/
+	if (rand()%3 == 0)
+		SHMQueueInsertAfter(&topLinkPreElem->topLink, &elem->topLink);
+	SHMQueueInsertAfter(&belowLinkPreElem->belowLink, &elem->belowLink);
+}
+
+static inline bool
+SkipListExist(SHM_SKIPLIST* skiplist, Size valueOffset, unsigned long value)
+{
+	return SkipListFindElem(skiplist, valueOffset, value, NULL, NULL);
+}
+
+
+
+/*------------------------------------------------------------------------*/
 
 /*
  * These functions manage primitive access to the RWConflict pool and lists.
@@ -636,7 +734,11 @@ static bool
 RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
 {
 	RWConflict	conflict;
-
+	Size 		valueOffset;
+    unsigned long long _begin, _end;
+    bool result;
+    
+    result=  false;
 	Assert(reader != writer);
 
 	/* Check the ends of the purported conflict first. */
@@ -645,24 +747,36 @@ RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
 		|| SHMQueueEmpty(&reader->outConflicts)
 		|| SHMQueueEmpty(&writer->inConflicts))
 		return false;
-
-	/* A conflict is possible; walk the list to find out. */
-	conflict = (RWConflict)
-		SHMQueueNext(&reader->outConflicts,
+    
+    /* if there are more than 10 conflicts in the outConflicts, we use skip list to search; 
+     * Otherwise we use linked list directly.
+     */
+    if (reader->outConflictsNum > 10 )
+    {
+		valueOffset = SKIPLIST_VALUE_OFFSET(RWConflictData, outTopLink, sxactIn);
+		result = SkipListExist((SHM_SKIPLIST*)&reader->outConflictsTopLink, 
+					  valueOffset,
+					  GET_SKIPLIST_VALUE(reader, valueOffset));
+	}else{
+        conflict = (RWConflict)
+	    	SHMQueueNext(&reader->outConflicts,
 					 &reader->outConflicts,
 					 offsetof(RWConflictData, outLink));
-	while (conflict)
-	{
-		if (conflict->sxactIn == writer)
-			return true;
-		conflict = (RWConflict)
-			SHMQueueNext(&reader->outConflicts,
+	    while (conflict)
+    	{
+		    if (conflict->sxactIn == writer){
+			    result= true;
+                break;
+	    	}
+		    conflict = (RWConflict)
+			    SHMQueueNext(&reader->outConflicts,
 						 &conflict->outLink,
 						 offsetof(RWConflictData, outLink));
-	}
+	    }
 
-	/* No conflict found. */
-	return false;
+    }
+    __asm__ __volatile__ ("rdtsc");
+    return result;
 }
 
 static void
@@ -684,11 +798,24 @@ SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
 				 errhint("You might need to run fewer transactions at a time or increase max_connections.")));
 
 	SHMQueueDelete(&conflict->outLink);
+	SHMQueueInit(&conflict->outTopLink);
+	SHMQueueInit(&conflict->inTopLink);
 
 	conflict->sxactOut = reader;
 	conflict->sxactIn = writer;
-	SHMQueueInsertBefore(&reader->outConflicts, &conflict->outLink);
-	SHMQueueInsertBefore(&writer->inConflicts, &conflict->inLink);
+
+//	SHMQueueInsertBefore(&reader->outConflicts, &conflict->outLink);
+//	SHMQueueInsertBefore(&writer->inConflicts, &conflict->inLink);
+	SkipListInsert((SHM_SKIPLIST*)&reader->outConflictsTopLink, 
+					SKIPLIST_VALUE_OFFSET(RWConflictData, outTopLink, sxactIn),
+					(SHM_SKIPLIST*)&conflict->outTopLink);
+	SkipListInsert((SHM_SKIPLIST*)&writer->inConflictsTopLink,
+					SKIPLIST_VALUE_OFFSET(RWConflictData, inTopLink, sxactOut),
+					(SHM_SKIPLIST*)&conflict->inTopLink);
+
+	reader->outConflictsNum++;
+	writer->inConflictsNum++;
+
 }
 
 static void
@@ -726,6 +853,10 @@ ReleaseRWConflict(RWConflict conflict)
 {
 	SHMQueueDelete(&conflict->inLink);
 	SHMQueueDelete(&conflict->outLink);
+	if (conflict->inTopLink.prev)
+		SHMQueueDelete(&conflict->inTopLink);
+	if (conflict->outTopLink.prev)
+		SHMQueueDelete(&conflict->outTopLink);
 	SHMQueueInsertBefore(&RWConflictPool->availableList, &conflict->outLink);
 }
 
@@ -1204,7 +1335,13 @@ InitPredicateLocks(void)
 		PredXact->OldCommittedSxact->prepareSeqNo = 0;
 		PredXact->OldCommittedSxact->commitSeqNo = 0;
 		PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
+
+		PredXact->OldCommittedSxact->outConflictsNum = 0;
+		PredXact->OldCommittedSxact->inConflictsNum = 0;
+		
+		SHMQueueInit(&PredXact->OldCommittedSxact->outConflictsTopLink);
 		SHMQueueInit(&PredXact->OldCommittedSxact->outConflicts);
+		SHMQueueInit(&PredXact->OldCommittedSxact->inConflictsTopLink);
 		SHMQueueInit(&PredXact->OldCommittedSxact->inConflicts);
 		SHMQueueInit(&PredXact->OldCommittedSxact->predicateLocks);
 		SHMQueueInit(&PredXact->OldCommittedSxact->finishedLink);
@@ -1796,8 +1933,14 @@ GetSerializableTransactionSnapshotInt(Snapshot snapshot,
 	sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
 	sxact->prepareSeqNo = InvalidSerCommitSeqNo;
 	sxact->commitSeqNo = InvalidSerCommitSeqNo;
+
+	sxact->outConflictsNum = 0;
+	SHMQueueInit(&(sxact->outConflictsTopLink));
 	SHMQueueInit(&(sxact->outConflicts));
+	sxact->inConflictsNum = 0;
+	SHMQueueInit(&(sxact->inConflictsTopLink));
 	SHMQueueInit(&(sxact->inConflicts));
+
 	SHMQueueInit(&(sxact->possibleUnsafeConflicts));
 	sxact->topXid = GetTopTransactionIdIfAny();
 	sxact->finishedBefore = InvalidTransactionId;
@@ -3423,8 +3566,11 @@ ReleasePredicateLocks(bool isCommit)
 
 		if (!isCommit
 			|| SxactIsCommitted(conflict->sxactIn)
-			|| (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
+			|| (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo)){
+			MySerializableXact->outConflictsNum--;
+            conflict->sxactIn->inConflictsNum--;
 			ReleaseRWConflict(conflict);
+		}
 
 		conflict = nextConflict;
 	}
@@ -3446,8 +3592,11 @@ ReleasePredicateLocks(bool isCommit)
 
 		if (!isCommit
 			|| SxactIsCommitted(conflict->sxactOut)
-			|| SxactIsReadOnly(conflict->sxactOut))
+			|| SxactIsReadOnly(conflict->sxactOut)){
+			MySerializableXact->inConflictsNum--;
+            conflict->sxactOut->outConflictsNum--;
 			ReleaseRWConflict(conflict);
+		}
 
 		conflict = nextConflict;
 	}
@@ -3839,6 +3988,8 @@ ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
 			if (summarize)
 				conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
 			ReleaseRWConflict(conflict);
+			sxact->outConflictsNum--;
+            conflict->sxactIn->inConflictsNum--;
 			conflict = nextConflict;
 		}
 	}
@@ -3857,6 +4008,8 @@ ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
 		if (summarize)
 			conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
 		ReleaseRWConflict(conflict);
+		sxact->inConflictsNum--;
+        conflict->sxactOut->outConflictsNum--;
 		conflict = nextConflict;
 	}
 
@@ -4969,7 +5122,9 @@ predicatelock_twophase_recover(TransactionId xid, uint16 info,
 		 * we'll conservatively assume that it had both a conflict in and a
 		 * conflict out, and represent that with the summary conflict flags.
 		 */
+		SHMQueueInit(&(sxact->outConflictsTopLink));
 		SHMQueueInit(&(sxact->outConflicts));
+		SHMQueueInit(&(sxact->inConflictsTopLink));
 		SHMQueueInit(&(sxact->inConflicts));
 		sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
 		sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h
index 3cb0ab9..dbc5bcf 100644
--- a/src/include/storage/predicate_internals.h
+++ b/src/include/storage/predicate_internals.h
@@ -83,8 +83,13 @@ typedef struct SERIALIZABLEXACT
 		SerCommitSeqNo lastCommitBeforeSnapshot;		/* when not committed or
 														 * no conflict out */
 	}			SeqNo;
+	int 		outConflictsNum;
+	SHM_QUEUE	outConflictsTopLink;
 	SHM_QUEUE	outConflicts;	/* list of write transactions whose data we
 								 * couldn't read. */
+
+	int 		inConflictsNum;
+	SHM_QUEUE	inConflictsTopLink;
 	SHM_QUEUE	inConflicts;	/* list of read transactions which couldn't
 								 * see our write. */
 	SHM_QUEUE	predicateLocks; /* list of associated PREDICATELOCK objects */
@@ -182,6 +187,15 @@ typedef struct PredXactListData *PredXactList;
 		((Size)MAXALIGN(sizeof(PredXactListData)))
 
 
+/* 
+ * 
+ */
+typedef struct SHM_SKIPLIST
+{
+	SHM_QUEUE topLink;
+	SHM_QUEUE belowLink;
+}SHM_SKIPLIST;
+
 /*
  * The following types are used to provide lists of rw-conflicts between
  * pairs of transactions.  Since exactly the same information is needed,
@@ -194,7 +208,9 @@ typedef struct PredXactListData *PredXactList;
  */
 typedef struct RWConflictData
 {
+	SHM_QUEUE	outTopLink;
 	SHM_QUEUE	outLink;		/* link for list of conflicts out from a sxact */
+	SHM_QUEUE	inTopLink;
 	SHM_QUEUE	inLink;			/* link for list of conflicts in to a sxact */
 	SERIALIZABLEXACT *sxactOut;
 	SERIALIZABLEXACT *sxactIn;
#2Robert Haas
robertmhaas@gmail.com
In reply to: Mengxing Liu (#1)
Re: [GSOC] Eliminate O(N^2) scaling from rw-conflict tracking in serializable transactions

On Wed, Jul 26, 2017 at 11:41 AM, Mengxing Liu <
liu-mx15@mails.tsinghua.edu.cn> wrote:

Hi, all. There was a very strange phenomenon I couldn't explain. So I was
wondering if you can help me.

I was trying to replace the linked list with a skip list in serializable
transaction object for faster conflict tracking. But the performance is bad.
So I used the instruction "rdtsc" to compare the speed of my skip list and
the original linked list. The skip list was about 1.5x faster.

The interesting thing is that if I added the instruction "rdstc" at the
end of the function "RWConflictExists",
the performance of the whole system was increased by at most 3 times!
Here is the result.

benchmarks without rdtsc with rdtsc
simpe read/write 4.91 14.16
ssibench 9.72 10.24
tpcb 26.45 26.38

( The simple read/write benchmark has the most number of conflicts. )

The patch is attached. All the difference of the two columns is
with/without a simple line of code:
__asm__ __volatile__ ("rdtsc");
But I don't know why this instruction will influence the performance so
much!

Lock contention is really expensive, so a slight delay that is just long
enough to prevent the contention from happening can sometimes improve
performance. This example is surprisingly dramatic, though. Of course, we
can't commit it this way -- it will break on non-x86.

I would suggest that you gather information on what wait events are
occurring in the "without rdtsc" case. Like this:

$ script
$ psql
psql=> select wait_event from pg_stat_activity;
psql=> \watch 0.5
...run test in another window...
^C
\q
^D
...use awk or perl or something to count up the wait events and see where
the contention is happening...

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

#3Mengxing Liu
liu-mx15@mails.tsinghua.edu.cn
In reply to: Robert Haas (#2)
Re: [GSOC] Eliminate O(N^2) scaling from rw-conflict tracking in serializable transactions

Thanks for your reply.

Actually, the result of without "rdtsc" is reasonable. I used perf to analyze the performance and found that
even thought the function tracking conflicts (RWConflictExists) was faster, the function inserting conflicts (SetRWConflict)
was too slower. According to your suggestion, I found there were much more waiting events of predicate_lock_manager.
That means, slower SetRWConflict resulted in more lock conflicts.
So I took some effort to made it faster in the last days.

Why the code with "rdtsc" is much faster? I thought that may be caused by some mistakes.
When I changed a machine to run the code, this phenomenon didn't happen anymore..
-----Original Messages-----
From: "Robert Haas" <robertmhaas@gmail.com>
Sent Time: 2017-07-29 02:46:47 (Saturday)
To: "Mengxing Liu" <liu-mx15@mails.tsinghua.edu.cn>
Cc: "Alvaro Herrera" <alvherre@2ndquadrant.com>, kgrittn <kgrittn@gmail.com>, "pgsql-hackers@postgresql.org" <pgsql-hackers@postgresql.org>
Subject: Re: [HACKERS] [GSOC] Eliminate O(N^2) scaling from rw-conflict tracking in serializable transactions

On Wed, Jul 26, 2017 at 11:41 AM, Mengxing Liu <liu-mx15@mails.tsinghua.edu.cn> wrote:

Hi, all. There was a very strange phenomenon I couldn't explain. So I was wondering if you can help me.

I was trying to replace the linked list with a skip list in serializable transaction object for faster conflict tracking. But the performance is bad.
So I used the instruction "rdtsc" to compare the speed of my skip list and the original linked list. The skip list was about 1.5x faster.

The interesting thing is that if I added the instruction "rdstc" at the end of the function "RWConflictExists",
the performance of the whole system was increased by at most 3 times!
Here is the result.

| benchmarks | without rdtsc | with rdtsc |
| simpe read/write | 4.91 | 14.16 |
| ssibench | 9.72 | 10.24 |
| tpcb | 26.45 | 26.38 |

( The simple read/write benchmark has the most number of conflicts. )

The patch is attached. All the difference of the two columns is with/without a simple line of code:
__asm__ __volatile__ ("rdtsc");
But I don't know why this instruction will influence the performance so much!

Lock contention is really expensive, so a slight delay that is just long enough to prevent the contention from happening can sometimes improve performance. This example is surprisingly dramatic, though. Of course, we can't commit it this way -- it will break on non-x86.

I would suggest that you gather information on what wait events are occurring in the "without rdtsc" case. Like this:

$ script

$ psql

psql=> select wait_event from pg_stat_activity;

psql=> \watch 0.5

...run test in another window...

^C

\q

^D

...use awk or perl or something to count up the wait events and see where the contention is happening...

--

Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--

Sincerely

Mengxing Liu