diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index a2a7e0c520..d6b19bb9ad 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1612,6 +1612,231 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
 									rterm->pathtarget->width);
 }
 
+/*
+ * is_fake_var
+ *		Workaround for generate_append_tlist() which generates fake Vars with
+ *		varno == 0, which would cause a call of estimate_num_groups() to fail
+ */
+static bool
+is_fake_var(Expr *expr)
+{
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	return (IsA(expr, Var) && ((Var *) expr)->varno == 0);
+}
+
+/*
+ * get_width_cost_multiplier
+ *		Returns relative complexity of comparing two values based on their width.
+ * The idea is that longer values have more expensive comparisons.  The return
+ * value is in cpu_operator_cost units.
+ */
+static double
+get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
+{
+	double	width = -1.0; /* fake value */
+
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	/* Try to find actual stats in the corresponding relation */
+	if (IsA(expr, Var))
+	{
+		Var		*var = (Var *) expr;
+
+		if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+		{
+			RelOptInfo	*rel = root->simple_rel_array[var->varno];
+
+			if (rel != NULL &&
+				var->varattno >= rel->min_attr &&
+				var->varattno <= rel->max_attr)
+			{
+				int	ndx = var->varattno - rel->min_attr;
+
+				if (rel->attr_widths[ndx] > 0)
+					width = rel->attr_widths[ndx];
+			}
+		}
+	}
+
+	/* Didn't find any actual stats, use estimation by type */
+	if (width < 0.0)
+	{
+		Node	*node = (Node*) expr;
+
+		width = get_typavgwidth(exprType(node), exprTypmod(node));
+	}
+
+	/*
+	 * Any value in PostgreSQL is passed as a Datum, so no operation on a value
+	 * can be cheaper than an operation on a plain Datum
+	 */
+	if (width <= sizeof(Datum))
+		return 1.0;
+
+	/*
+	 * It seems the cost of comparison is not directly proportional to the args'
+	 * width, because the compared args may differ in width (we know only the
+	 * average over a column) and the difference can often be detected from just
+	 * the first bytes.  So, use log16(width) as the estimate.
+	 */
+	return 1.0 + 0.125 * LOG2(width / sizeof(Datum));
+}
+
+
+/*
+ * compute_cpu_sort_cost
+ *		compute CPU cost of sort (i.e. in-memory)
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys.  In this case, we fall back to a
+ * simple comparison cost estimate.
+ */
+
+static Cost
+compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys,
+						   Cost comparison_cost,
+						   double tuples, double output_tuples, bool heapSort)
+{
+	Cost		per_tuple_cost = comparison_cost;
+	ListCell	*lc;
+	List		*pathkeyExprs = NIL;
+	double		tuplesPerGroup;
+	bool		has_fake_var = false;
+
+	/* fallback if pathkeys is unknown */
+	if (list_length(pathkeys) == 0)
+	{
+		/*
+		 * If we'll use a bounded heap-sort keeping just K tuples in memory, for
+		 * a total number of tuple comparisons of N log2 K; but the constant
+		 * factor is a bit higher than for quicksort.  Tweak it so that the
+		 * cost curve is continuous at the crossover point.
+		 */
+		output_tuples = (heapSort) ? 2.0 * output_tuples : tuples;
+		per_tuple_cost += 2.0 * cpu_operator_cost * LOG2(output_tuples);
+
+		return per_tuple_cost * tuples;
+	}
+
+	/*
+	 * Computing total cost of sorting takes into account:
+	 * - per column comparison function cost
+	 * - we try to compute needed number of comparison per column
+	 */
+
+	foreach(lc, pathkeys)
+	{
+		PathKey	*pathkey = (PathKey*)lfirst(lc);
+		Cost	funcCost = 1.0; /* fallback value */
+		EquivalenceMember	*em;
+
+		/*
+		 * We assume that equivalence members aren't very different, so, to
+		 * estimate the cost we take just the first member.
+		 */
+		em = (EquivalenceMember *) linitial(pathkey->pk_eclass->ec_members);
+
+		if (em->em_datatype != InvalidOid)
+		{
+			Oid		sortop;
+
+			sortop = get_opfamily_member(pathkey->pk_opfamily,
+										 em->em_datatype, em->em_datatype,
+										 pathkey->pk_strategy);
+
+			if (sortop != InvalidOid)
+			{
+				Oid	funcOid = get_opcode(sortop);
+
+				funcCost = get_func_cost(funcOid);
+			}
+		}
+
+		/* Try to take the actual width-based cost into account */
+		funcCost *= get_width_cost_multiplier(root, em->em_expr);
+
+		if (pathkeyExprs == NIL)
+		{
+			/*
+			 * The first column is always compared.  The multiplier 2.0 is
+			 * empirical; it seems to account for transferring the tuple into
+			 * the CPU cache, initial steps of tuple unpacking, and so on.
+			 */
+			per_tuple_cost += 2.0 * funcCost * LOG2(tuples);
+		}
+		else
+		{
+			double		n_groups;
+
+			/*
+			 * Avoid calling estimate_num_groups() with a fake Var.  Note that
+			 * pathkeyExprs contains only the previous columns.
+			 */
+			if (has_fake_var == false)
+				n_groups = estimate_num_groups(root, pathkeyExprs, tuples, NULL);
+			else if (tuples > 4.0)
+				/*
+				 * Use the geometric mean as the estimate if there are no stats.
+				 * Don't use DEFAULT_NUM_DISTINCT because it applies to only one
+				 * column, while here we try to estimate the number of groups
+				 * over a set of columns.
+				 */
+				n_groups = ceil(2.0 + sqrt(tuples) *
+					(1 + list_length(pathkeyExprs)) / list_length(pathkeys));
+			else
+				n_groups = tuples;
+
+			/*
+			 * The second and following columns are compared only if the leading
+			 * columns are equal.  Hence, their comparison will be called
+			 * log2(number of tuples in group) times.
+			 * If the number of tuples in a group equals 1, then the comparison
+			 * functions for the current and following columns will not be
+			 * called at all.
+			 */
+			tuplesPerGroup = tuples/n_groups;
+
+			if (tuplesPerGroup > 1.0)
+			{
+				/* Top-N sort case: sort only inside output_tuples */
+				if (tuplesPerGroup > output_tuples)
+					tuplesPerGroup = output_tuples;
+
+				/* See comment above about heapsort */
+				if (heapSort)
+					tuplesPerGroup *= 2.0;
+
+				/*
+				 * Real-world distributions aren't uniform, but for now we have
+				 * no way to determine that, so add a multiplier to get closer
+				 * to the worst case.
+				 */
+				per_tuple_cost += 1.5 * funcCost * LOG2(tuplesPerGroup);
+			}
+			else
+			{
+				/* we can skip all following columns for cost estimation */
+				break;
+			}
+		}
+
+		/* remember if we have a fake Var in the pathkeys */
+		has_fake_var |= is_fake_var(em->em_expr);
+
+		pathkeyExprs = lappend(pathkeyExprs, em->em_expr);
+	}
+
+	list_free(pathkeyExprs);
+
+	/* per_tuple_cost is in cpu_operator_cost units */
+	per_tuple_cost *= cpu_operator_cost;
+
+	return tuples * per_tuple_cost;
+}
+
 /*
  * cost_sort
  *	  Determines and returns the cost of sorting a relation, including
@@ -1628,7 +1853,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * number of initial runs formed and M is the merge order used by tuplesort.c.
  * Since the average initial run should be about sort_mem, we have
  *		disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
- *		cpu = comparison_cost * t * log2(t)
+ * and cpu cost computed by compute_cpu_sort_cost().
  *
  * If the sort is bounded (i.e., only the first k result tuples are needed)
  * and k tuples can fit into sort_mem, we use a heap method that keeps only
@@ -1649,13 +1874,6 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * 'comparison_cost' is the extra cost per comparison, if any
  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
- *
- * NOTE: some callers currently pass NIL for pathkeys because they
- * can't conveniently supply the sort keys.  Since this routine doesn't
- * currently do anything with pathkeys anyway, that doesn't matter...
- * but if it ever does, it should react gracefully to lack of key data.
- * (Actually, the thing we'd most likely be interested in is just the number
- * of sort keys, which all callers *could* supply.)
  */
 void
 cost_sort(Path *path, PlannerInfo *root,
@@ -1682,9 +1900,6 @@ cost_sort(Path *path, PlannerInfo *root,
 	if (tuples < 2.0)
 		tuples = 2.0;
 
-	/* Include the default cost-per-comparison */
-	comparison_cost += 2.0 * cpu_operator_cost;
-
 	/* Do we have a useful LIMIT? */
 	if (limit_tuples > 0 && limit_tuples < tuples)
 	{
@@ -1713,7 +1928,9 @@ cost_sort(Path *path, PlannerInfo *root,
 		 *
 		 * Assume about N log2 N comparisons
 		 */
-		startup_cost += comparison_cost * tuples * LOG2(tuples);
+		startup_cost += compute_cpu_sort_cost(root, pathkeys,
+											  comparison_cost, tuples,
+											  tuples, false);
 
 		/* Disk costs */
 
@@ -1729,18 +1946,17 @@ cost_sort(Path *path, PlannerInfo *root,
 	}
 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
 	{
-		/*
-		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
-		 * a total number of tuple comparisons of N log2 K; but the constant
-		 * factor is a bit higher than for quicksort.  Tweak it so that the
-		 * cost curve is continuous at the crossover point.
-		 */
-		startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
+		/* We'll use a bounded heap-sort keeping just K tuples in memory. */
+		startup_cost += compute_cpu_sort_cost(root, pathkeys,
+											  comparison_cost, tuples,
+											  output_tuples, true);
 	}
 	else
 	{
 		/* We'll use plain quicksort on all the input tuples */
-		startup_cost += comparison_cost * tuples * LOG2(tuples);
+		startup_cost += compute_cpu_sort_cost(root, pathkeys,
+											  comparison_cost, tuples,
+											  tuples, false);
 	}
 
 	/*
