From 351e5e05b471255d0f831d330f48ce95f16095eb Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandbossart@gmail.com>
Date: Sun, 19 Mar 2023 23:21:58 -0700
Subject: [PATCH v1 2/3] Introduce basic vector support.

This change introduces a variety of SQL-callable functions for
"vectors", or 1-D float8 arrays with no NULL values and at least
one element.
---
 doc/src/sgml/func.sgml                | 241 +++++++++++
 src/backend/utils/adt/Makefile        |   1 +
 src/backend/utils/adt/meson.build     |   1 +
 src/backend/utils/adt/vector.c        | 589 ++++++++++++++++++++++++++
 src/include/catalog/pg_proc.dat       |  49 +++
 src/test/regress/expected/vectors.out | 154 +++++++
 src/test/regress/parallel_schedule    |   2 +-
 src/test/regress/sql/vectors.sql      |  61 +++
 8 files changed, 1097 insertions(+), 1 deletion(-)
 create mode 100644 src/backend/utils/adt/vector.c
 create mode 100644 src/test/regress/expected/vectors.out
 create mode 100644 src/test/regress/sql/vectors.sql

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 5a47ce4343..1bd71ccefe 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -18950,6 +18950,247 @@ SELECT NULLIF(value, '(none)') ...
 </programlisting>
        </para></entry>
       </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_is_vector</primary>
+        </indexterm>
+        <function>array_is_vector</function> ( <type>float8[]</type> )
+        <returnvalue>boolean</returnvalue>
+       </para>
+       <para>
+        Returns whether the array is a vector (i.e., a one-dimensional array of
+        <type>float8</type> with no NULL values and at least one element).  The
+        values in such arrays are treated as Cartesian coordinates in
+        <literal>n</literal>-dimensional Euclidean space, where
+        <literal>n</literal> is the cardinality of the array.
+       </para>
+       <para>
+        <literal>array_is_vector{'1,2,3}')</literal>
+        <returnvalue>t</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>euclidean_norm</primary>
+        </indexterm>
+        <function>euclidean_norm</function> ( <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the Euclidean norm of the array.  The array must be a vector
+        (see <function>array_is_vector</function> for details about what
+        constitutes a vector).
+       </para>
+       <para>
+        <literal>euclidean_norm('{1,2,3}')</literal>
+        <returnvalue>3.7416573867739413</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>normalize_vector</primary>
+        </indexterm>
+        <function>normalize_vector</function> ( <type>float8[]</type> )
+        <returnvalue>float8[]</returnvalue>
+       </para>
+       <para>
+        Returns a normalized version of the array.  The array must be a vector
+        (see <function>array_is_vector</function> for details about what
+        constitutes a vector), and its Euclidean norm must be nonzero.
+       </para>
+       <para>
+        <literal>normalize_vector('{1,2,3}')</literal>
+        <returnvalue>{0.2672612419124244,0.5345224838248488,0.8017837257372732}</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>dot_product</primary>
+        </indexterm>
+        <function>dot_product</function> ( <type>float8[]</type>, <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the dot product of the arrays.  The arrays must be vectors (see
+        <function>array_is_vector</function> for details about what constitutes
+        a vector).
+       </para>
+       <para>
+        <literal>dot_product('{1,2,3}', '{4,5,6}')</literal>
+        <returnvalue>32</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>squared_euclidean_distance</primary>
+        </indexterm>
+        <function>squared_euclidean_distance</function> ( <type>float8[]</type>, <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the squared Euclidean distance between the arrays.  The arrays
+        must be vectors (see <function>array_is_vector</function> for details
+        about what constitutes a vector).
+       </para>
+       <para>
+        <literal>squared_euclidean_distance('{1,2,3}', '{4,5,6}')</literal>
+        <returnvalue>27</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>euclidean_distance</primary>
+        </indexterm>
+        <function>euclidean_distance</function> ( <type>float8[]</type>, <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the Euclidean distance between the arrays.  The arrays must be
+        vectors (see <function>array_is_vector</function> for details about
+        what constitutes a vector).
+       </para>
+       <para>
+        <literal>euclidean_distance('{1,2,3}', '{4,5,6}')</literal>
+        <returnvalue>5.196152422706632</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>cosine_distance</primary>
+        </indexterm>
+        <function>cosine_distance</function> ( <type>float8[]</type>, <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the cosine distance between the arrays.  The arrays must be
+        vectors (see <function>array_is_vector</function> for details about
+        what constitutes a vector).
+       </para>
+       <para>
+        <literal>cosine_distance('{1,2,3}', '{4,5,6}')</literal>
+        <returnvalue>0.025368153802923787</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>taxicab_distance</primary>
+        </indexterm>
+        <function>taxicab_distance</function> ( <type>float8[]</type>, <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the taxicab distance between the arrays.  The arrays must be
+        vectors (see <function>array_is_vector</function> for details about
+        what constitutes a vector).
+       </para>
+       <para>
+        <literal>taxicab_distance('{1,2,3}', '{4,5,6}')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>chebyshev_distance</primary>
+        </indexterm>
+        <function>chebyshev_distance</function> ( <type>float8[]</type>, <type>float8[]</type> )
+        <returnvalue>float8</returnvalue>
+       </para>
+       <para>
+        Returns the Chebyshev distance between the arrays.  The arrays must be
+        vectors (see <function>array_is_vector</function> for details about
+        what constitutes a vector).
+       </para>
+       <para>
+        <literal>chebyshev_distance('{1,2,3}', '{4,5,6}')</literal>
+        <returnvalue>3</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>standard_unit_vector</primary>
+        </indexterm>
+        <function>standard_unit_vector</function> ( <parameter>d</parameter> <type>integer</type>, <parameter>n</parameter> <type>integer</type> )
+        <returnvalue>float8[]</returnvalue>
+       </para>
+       <para>
+        Returns an array of <type>float8</type> with <parameter>d</parameter>
+        elements.  All elements are set to <literal>0</literal> except for the
+        <parameter>n</parameter>th element, which is set to
+        <literal>1</literal>.
+       </para>
+       <para>
+        <literal>standard_unit_vector(3, 1)</literal>
+        <returnvalue>{1,0,0}</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>kmeans</primary>
+        </indexterm>
+        <function>kmeans</function> ( <parameter>n</parameter> <type>integer</type>, <parameter>v</parameter> <type>float8[]</type>, <parameter>i</parameter> <type>integer</type> )
+        <returnvalue>setof float8[]</returnvalue>
+       </para>
+       <para>
+        Returns <parameter>n</parameter> centers calculated via
+        <parameter>i</parameter> iterations of the <literal>kmeans++</literal>
+        algorithm on the array of vectors <parameter>v</parameter> (see
+        <function>array_is_vector</function> for details about what constitutes
+        a vector).
+       </para>
+       <para>
+        <literal>kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1)</literal>
+        <returnvalue></returnvalue>
+<programlisting>
+     kmeans
+----------------
+ {2.5,3.5,4.5}
+ {8.5,9.5,10.5}
+</programlisting>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>closest_vector</primary>
+        </indexterm>
+        <function>closest_vector</function> ( <parameter>c</parameter> <type>float8[]</type>, <parameter>v</parameter> <type>float8[]</type> )
+        <returnvalue>integer</returnvalue>
+       </para>
+       <para>
+        Returns the index of the vector in <parameter>c</parameter> that is
+        closest to <parameter>v</parameter>.  <parameter>c</parameter> must be
+        an array of vectors, and <parameter>v</parameter> must be a vector (see
+        <function>array_is_vector</function> for details about what constitutes
+        a vector).
+       </para>
+       <para>
+        <literal>closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', '{10,11,12}')</literal>
+        <returnvalue>2</returnvalue>
+       </para></entry>
+      </row>
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index 0de0bbb1b8..ffda4038f1 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -114,6 +114,7 @@ OBJS = \
 	varbit.o \
 	varchar.o \
 	varlena.o \
+	vector.o \
 	version.o \
 	windowfuncs.o \
 	xid.o \
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index 8515cd9365..dfa4eeb52f 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -101,6 +101,7 @@ backend_sources += files(
   'varbit.c',
   'varchar.c',
   'varlena.c',
+  'vector.c',
   'version.c',
   'windowfuncs.c',
   'xid.c',
diff --git a/src/backend/utils/adt/vector.c b/src/backend/utils/adt/vector.c
new file mode 100644
index 0000000000..dba1fd7eef
--- /dev/null
+++ b/src/backend/utils/adt/vector.c
@@ -0,0 +1,589 @@
+/*-------------------------------------------------------------------------
+ *
+ * vector.c
+ *      Support functions for float8 arrays treated as vectors.
+ *
+ * For the purposes of this file, a "vector" is a one-dimensional float8
+ * array with no NULL values and at least one element.  The values in such
+ * arrays are treated as Cartesian coordinates in n-dimensional Euclidean
+ * space, where "n" is the cardinality of the array.  For more information,
+ * see https://en.wikipedia.org/wiki/Cartesian_coordinate_system.
+ *
+ * Copyright (c) 2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/backend/utils/adt/vector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <float.h>
+
+#include "catalog/pg_type.h"
+#include "common/int.h"
+#include "common/pg_prng.h"
+#include "funcapi.h"
+#include "utils/array.h"
+#include "utils/float.h"
+#include "utils/fmgrprotos.h"
+
+static inline int float8_array_is_vector_internal(ArrayType *v, bool error);
+static inline float8 euclidean_norm_internal(int n, const float8 *a);
+static inline float8 squared_euclidean_distance_internal(int n, const float8 *a, const float8 *b);
+static inline float8 euclidean_distance_internal(int n, const float8 *a, const float8 *b);
+
+/*
+ * Checks that an array is a vector (as described at the top of this file) and
+ * returns its cardinality.  If "error" is true, this function ERRORs if the
+ * array is not a vector.  If "error" is false, this function returns -1 if the
+ * array is not a vector.
+ */
+static inline int
+float8_array_is_vector_internal(ArrayType *v, bool error)
+{
+	int			n = ArrayGetNItems(ARR_NDIM(v), ARR_DIMS(v));
+
+	if (unlikely(n == 0))
+	{
+		if (error)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("vectors must have at least one element")));
+		return -1;
+	}
+
+	if (unlikely(ARR_NDIM(v) != 1))
+	{
+		if (error)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("vectors must be one-dimensional arrays")));
+		return -1;
+	}
+
+	if (unlikely(ARR_HASNULL(v) && array_contains_nulls(v)))
+	{
+		if (error)
+			ereport(ERROR,
+					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+					 errmsg("vectors must not contain nulls")));
+		return -1;
+	}
+
+	return n;
+}
+
+/*
+ * SQL-callable version of float8_array_is_vector_internal().  Unlike the
+ * internal version, this does not return the array's cardinality.
+ */
+Datum
+float8_array_is_vector(PG_FUNCTION_ARGS)
+{
+	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
+	bool		r = (float8_array_is_vector_internal(v, false) != -1);
+
+	PG_RETURN_BOOL(r);
+}
+
+/*
+ * Workhorse for euclidean_norm() and normalize_vector().
+ */
+static inline float8
+euclidean_norm_internal(int n, const float8 *a)
+{
+	float8		en = 0.0;
+
+	for (int i = 0; i < n; i++)
+		en = float8_pl(en, float8_mul(a[i], a[i]));
+
+	en = float8_sqrt(en);
+
+	return en;
+}
+
+/*
+ * Returns the Euclidean norm of a vector.  For more information, see
+ * https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
+ */
+Datum
+euclidean_norm(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	int			an = float8_array_is_vector_internal(a, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+
+	PG_RETURN_FLOAT8(euclidean_norm_internal(an, ad));
+}
+
+/*
+ * Returns a normalized version of the given vector.  For more information, see
+ * https://en.wikipedia.org/wiki/Unit_vector.
+ */
+Datum
+normalize_vector(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *ua;
+	int			an = float8_array_is_vector_internal(a, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8		en = euclidean_norm_internal(an, ad);
+	Datum	   *ud = palloc(an * sizeof(Datum));
+
+	for (int i = 0; i < an; i++)
+		ud[i] = Float8GetDatumFast(float8_div(ad[i], en));
+
+	ua = construct_array(ud, an, FLOAT8OID, sizeof(float8),
+						 FLOAT8PASSBYVAL, TYPALIGN_DOUBLE);
+
+	PG_RETURN_ARRAYTYPE_P(ua);
+}
+
+/*
+ * Returns the dot product of two vectors.  For more information, see
+ * https://en.wikipedia.org/wiki/Dot_product.
+ */
+Datum
+dot_product(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int			an = float8_array_is_vector_internal(a, true);
+	int			bn = float8_array_is_vector_internal(b, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8	   *bd = (float8 *) ARR_DATA_PTR(b);
+	float8		dp = 0.0;
+
+	if (an != bn)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	for (int i = 0; i < an; i++)
+		dp = float8_pl(dp, float8_mul(ad[i], bd[i]));
+
+	PG_RETURN_FLOAT8(dp);
+}
+
+/*
+ * Workhorse for squared_euclidean_distance() and
+ * euclidean_distance_internal().
+ */
+static inline float8
+squared_euclidean_distance_internal(int n, const float8 *a, const float8 *b)
+{
+	float8		sed = 0.0;
+
+	for (int i = 0; i < n; i++)
+	{
+		float8		d = float8_mi(a[i], b[i]);
+
+		sed = float8_pl(sed, float8_mul(d, d));
+	}
+
+	return sed;
+}
+
+/*
+ * Returns the squared Euclidean distance between two vectors.  For more
+ * information, see
+ * https://en.wikipedia.org/wiki/Euclidean_distance#Squared_Euclidean_distance.
+ */
+Datum
+squared_euclidean_distance(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int			an = float8_array_is_vector_internal(a, true);
+	int			bn = float8_array_is_vector_internal(b, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8	   *bd = (float8 *) ARR_DATA_PTR(b);
+	float8		sed;
+
+	if (an != bn)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	sed = squared_euclidean_distance_internal(an, ad, bd);
+
+	PG_RETURN_FLOAT8(sed);
+}
+
+/*
+ * Workhorse for euclidean_distance().
+ */
+static inline float8
+euclidean_distance_internal(int n, const float8 *a, const float8 *b)
+{
+	float8		sed = squared_euclidean_distance_internal(n, a, b);
+
+	return float8_sqrt(sed);
+}
+
+/*
+ * Returns the Euclidean distance between vectors.  For more information, see
+ * https://en.wikipedia.org/wiki/Euclidean_distance.
+ */
+Datum
+euclidean_distance(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int			an = float8_array_is_vector_internal(a, true);
+	int			bn = float8_array_is_vector_internal(b, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8	   *bd = (float8 *) ARR_DATA_PTR(b);
+	float8		ed;
+
+	if (an != bn)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	ed = euclidean_distance_internal(an, ad, bd);
+
+	PG_RETURN_FLOAT8(ed);
+}
+
+/*
+ * Returns the cosine distance between two vectors.  For more information, see
+ * https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_Distance.
+ */
+Datum
+cosine_distance(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int			an = float8_array_is_vector_internal(a, true);
+	int			bn = float8_array_is_vector_internal(b, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8	   *bd = (float8 *) ARR_DATA_PTR(b);
+	float8		dp = 0.0;
+	float8		aen = 0.0;
+	float8		ben = 0.0;
+	float8		cd;
+
+	if (an != bn)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	for (int i = 0; i < an; i++)
+	{
+		dp = float8_pl(dp, float8_mul(ad[i], bd[i]));
+		aen = float8_pl(aen, float8_mul(ad[i], ad[i]));
+		ben = float8_pl(ben, float8_mul(bd[i], bd[i]));
+	}
+
+	cd = float8_mi(1.0, float8_div(dp, float8_sqrt(float8_mul(aen, ben))));
+
+	PG_RETURN_FLOAT8(cd);
+}
+
+/*
+ * Returns the taxicab distance between two vectors.  For more information, see
+ * https://en.wikipedia.org/wiki/Taxicab_geometry.
+ */
+Datum
+taxicab_distance(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int			an = float8_array_is_vector_internal(a, true);
+	int			bn = float8_array_is_vector_internal(b, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8	   *bd = (float8 *) ARR_DATA_PTR(b);
+	float8		td = 0.0;
+
+	if (an != bn)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	for (int i = 0; i < an; i++)
+		td = float8_pl(td, fabs(float8_mi(ad[i], bd[i])));
+
+	PG_RETURN_FLOAT8(td);
+}
+
+/*
+ * Returns the Chebyshev distance between two vectors.  For more information,
+ * see https://en.wikipedia.org/wiki/Chebyshev_distance.
+ */
+Datum
+chebyshev_distance(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int			an = float8_array_is_vector_internal(a, true);
+	int			bn = float8_array_is_vector_internal(b, true);
+	float8	   *ad = (float8 *) ARR_DATA_PTR(a);
+	float8	   *bd = (float8 *) ARR_DATA_PTR(b);
+	float8		cd = 0.0;
+
+	if (an != bn)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	for (int i = 0; i < an; i++)
+		cd = float8_max(cd, fabs(float8_mi(ad[i], bd[i])));
+
+	PG_RETURN_FLOAT8(cd);
+}
+
+/*
+ * Returns a standard unit vector with the nth element set to 1.  All other
+ * elements are set to 0.  For more information, see
+ * https://en.wikipedia.org/wiki/Standard_basis.
+ */
+Datum
+standard_unit_vector(PG_FUNCTION_ARGS)
+{
+	int			d = PG_GETARG_INT32(0);
+	int			n = PG_GETARG_INT32(1);
+	Datum	   *ud = palloc(d * sizeof(Datum));
+	ArrayType  *ua;
+
+	if (d <= 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("standard unit vectors must have at least one element")));
+
+	if (n == 0 || n > d)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("cannot set nonexistent element to one")));
+
+	for (int i = 0; i < d; i++)
+		ud[i] = Float8GetDatumFast(0.0);
+
+	ud[n - 1] = Float8GetDatumFast(1.0);
+
+	ua = construct_array(ud, d, FLOAT8OID, sizeof(float8),
+						 FLOAT8PASSBYVAL, TYPALIGN_DOUBLE);
+
+	PG_RETURN_ARRAYTYPE_P(ua);
+}
+
+/*
+ * Returns centers generated via the kmeans++ algorithm.  For more information,
+ * see https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf.
+ *
+ * XXX: Accelerate using triange inequality (see
+ * https://cseweb.ucsd.edu/~elkan/kmeansicml03.pdf).
+ */
+Datum
+kmeans(PG_FUNCTION_ARGS)
+{
+	int			ncenters = PG_GETARG_INT32(0);
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(1);
+	int			iterations = PG_GETARG_INT32(2);
+	int			nsamples = ARR_NDIM(a) > 1 ? ARR_DIMS(a)[0] : ARR_NDIM(a);
+	int			dimensions;
+	int		   *clusters = palloc(nsamples * sizeof(int));
+	int		   *pops = palloc(ncenters * sizeof(int));
+	float8	   **centers = palloc(ncenters * sizeof(float8 *));
+	float8	   **vectors = palloc(nsamples * sizeof(float8 *));
+	float8	   *weights = palloc(nsamples * sizeof(float8));
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	Datum	   *cdata;
+	pg_prng_state rstate;
+
+	InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
+
+	if (ncenters < 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("must request at least one center")));
+
+	if (ncenters > nsamples)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("number of sample vectors must be greater than or equal to the number of requested centers")));
+
+	if (ARR_HASNULL(a) && array_contains_nulls(a))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("vectors must not contain nulls")));
+
+	if (iterations < 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("must request at least one iteration")));
+
+	dimensions = ARR_NDIM(a) > 1 ? ARR_DIMS(a)[1] : ARR_DIMS(a)[0];
+	for (int i = 0; i < nsamples; i++)
+	{
+		int			r = -1;
+
+		if (pg_mul_s32_overflow(i, dimensions, &r))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("sample vectors too large")));
+
+		vectors[i] = ((float8 *) ARR_DATA_PTR(a)) + r;
+	}
+
+	/* choose initial center randomly */
+	pg_prng_seed(&rstate, 0x5eed);
+	centers[0] = vectors[(int) float8_mul(pg_prng_double(&rstate), nsamples)];
+
+	for (int i = 0; i < nsamples; i++)
+		weights[i] = DBL_MAX;
+
+	/* calculate initial values for the other centers */
+	for (int i = 1; i < ncenters; i++)
+	{
+		float8		sum = 0.0;
+		float8		wi;
+
+		for (int j = 0; j < nsamples; j++)
+		{
+			float8		distance = euclidean_distance_internal(dimensions,
+															   centers[i - 1],
+															   vectors[j]);
+
+			weights[j] = float8_min(distance, weights[j]);
+			sum = float8_pl(sum, weights[j]);
+		}
+
+		wi = float8_mul(pg_prng_double(&rstate), sum);
+		for (int j = 0; j < nsamples; j++)
+		{
+			wi = float8_mi(wi, weights[j]);
+			if (float8_le(wi, 0))
+			{
+				centers[i] = vectors[j];
+				break;
+			}
+		}
+	}
+
+	/* XXX: exit early if converged */
+	for (int i = 0; i < iterations; i++)
+	{
+		/* assign each sample to a center */
+		for (int j = 0; j < nsamples; j++)
+		{
+			float8		min_distance = DBL_MAX;
+
+			for (int k = 0; k < ncenters; k++)
+			{
+				float8		distance = euclidean_distance_internal(dimensions,
+																   centers[k],
+																   vectors[j]);
+
+				if (float8_lt(distance, min_distance))
+				{
+					min_distance = distance;
+					clusters[j] = k;
+				}
+			}
+		}
+
+		/* clear centers */
+		for (int j = 0; j < ncenters; j++)
+		{
+			if (i == 0)
+				centers[j] = palloc0(dimensions * sizeof(float8));
+			else
+				memset(centers[j], 0, dimensions * sizeof(float8));
+
+			pops[j] = 0;
+		}
+
+		/* set each center to average of all vectors in cluster */
+		for (int j = 0; j < nsamples; j++)
+		{
+			int			cluster = clusters[j];
+			float8	   *center = centers[cluster];
+
+			for (int k = 0; k < dimensions; k++)
+				center[k] = float8_pl(center[k], vectors[j][k]);
+
+			pops[cluster]++;
+		}
+
+		for (int j = 0; j < ncenters; j++)
+		{
+			for (int k = 0; k < dimensions; k++)
+				centers[j][k] = float8_div(centers[j][k], pops[j]);
+		}
+	}
+
+	cdata = palloc(dimensions * sizeof(Datum));
+	for (int i = 0; i < ncenters; i++)
+	{
+		Datum		values[1];
+		bool		nulls[1];
+
+		for (int j = 0; j < dimensions; j++)
+			cdata[j] = Float8GetDatumFast(centers[i][j]);
+
+		values[0] = PointerGetDatum(construct_array(cdata, dimensions,
+													FLOAT8OID, sizeof(float8),
+													FLOAT8PASSBYVAL,
+													TYPALIGN_DOUBLE));
+		nulls[0] = false;
+
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+	}
+
+	return (Datum) 0;
+}
+
+/*
+ * Returns the index of the closest vector.
+ */
+Datum
+closest_vector(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(1);
+	int			alen = ARR_NDIM(a) > 1 ? ARR_DIMS(a)[0] : ARR_NDIM(a);
+	int			dimensions = float8_array_is_vector_internal(v, true);
+	float8		min_dist = DBL_MAX;
+	float8	   *vd = (float8 *) ARR_DATA_PTR(v);
+	int			idx = -1;
+
+	if (alen == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("must provide at least one candidate vector")));
+
+	if (ARR_HASNULL(a) && array_contains_nulls(a))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("vectors must not contain nulls")));
+
+	if ((ARR_NDIM(a) > 1 ? ARR_DIMS(a)[1] : ARR_DIMS(a)[0]) != dimensions)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("vectors must have the same number of elements")));
+
+	for (int i = 0; i < alen; i++)
+	{
+		float8	   *c;
+		float8		d;
+		int			r = -1;
+
+		if (pg_mul_s32_overflow(i, dimensions, &r))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("candidate vectors too large")));
+
+		c = ((float8 *) ARR_DATA_PTR(a)) + r;
+		d = euclidean_distance_internal(dimensions, c, vd);
+
+		if (r == -1 || float8_lt(d, min_dist))
+		{
+			min_dist = d;
+			idx = i;
+		}
+	}
+
+	PG_RETURN_INT32(idx);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index b2bc81b15f..a07c5b10b0 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1735,6 +1735,55 @@
   proargtypes => 'internal oid internal int2 internal',
   prosrc => 'arraycontjoinsel' },
 
+{ oid => '4642', descr => 'verify float8 array is a vector',
+  proname => 'float8_array_is_vector', prorettype => 'bool',
+  proargtypes => '_float8',
+  prosrc => 'float8_array_is_vector' },
+{ oid => '4643', descr => 'euclidean norm',
+  proname => 'euclidean_norm', prorettype => 'float8',
+  proargtypes => '_float8',
+  prosrc => 'euclidean_norm' },
+{ oid => '4644', descr => 'dot product',
+  proname => 'dot_product', prorettype => 'float8',
+  proargtypes => '_float8 _float8',
+  prosrc => 'dot_product' },
+{ oid => '4645', descr => 'normalize a vector',
+  proname => 'normalize_vector', prorettype => '_float8',
+  proargtypes => '_float8',
+  prosrc => 'normalize_vector' },
+{ oid => '4646', descr => 'squared euclidean distance',
+  proname => 'squared_euclidean_distance', prorettype => 'float8',
+  proargtypes => '_float8 _float8',
+  prosrc => 'squared_euclidean_distance' },
+{ oid => '4647', descr => 'euclidean distance',
+  proname => 'euclidean_distance', prorettype => 'float8',
+  proargtypes => '_float8 _float8',
+  prosrc => 'euclidean_distance' },
+{ oid => '4648', descr => 'cosine distance',
+  proname => 'cosine_distance', prorettype => 'float8',
+  proargtypes => '_float8 _float8',
+  prosrc => 'cosine_distance' },
+{ oid => '4649', descr => 'taxicab distance',
+  proname => 'taxicab_distance', prorettype => 'float8',
+  proargtypes => '_float8 _float8',
+  prosrc => 'taxicab_distance' },
+{ oid => '4650', descr => 'chebyshev distance',
+  proname => 'chebyshev_distance', prorettype => 'float8',
+  proargtypes => '_float8 _float8',
+  prosrc => 'chebyshev_distance' },
+{ oid => '4651', descr => 'standard unit vector',
+  proname => 'standard_unit_vector', prorettype => '_float8',
+  proargtypes => 'int4 int4',
+  prosrc => 'standard_unit_vector' },
+{ oid => '4652', descr => 'kmeans',
+  proname => 'kmeans', prorettype => '_float8', proretset => 't',
+  proargtypes => 'int4 _float8 int4', prorows => '1000',
+  prosrc => 'kmeans' },
+{ oid => '4653', descr => 'closest vector',
+  proname => 'closest_vector', prorettype => 'int4',
+  proargtypes => '_float8 _float8',
+  prosrc => 'closest_vector' },
+
 { oid => '764', descr => 'large object import',
   proname => 'lo_import', provolatile => 'v', proparallel => 'u',
   prorettype => 'oid', proargtypes => 'text', prosrc => 'be_lo_import' },
diff --git a/src/test/regress/expected/vectors.out b/src/test/regress/expected/vectors.out
new file mode 100644
index 0000000000..846f02433c
--- /dev/null
+++ b/src/test/regress/expected/vectors.out
@@ -0,0 +1,154 @@
+SET extra_float_digits = -1;
+-- float8_array_is_vector
+SELECT float8_array_is_vector(ARRAY[]::float8[]);
+ float8_array_is_vector 
+------------------------
+ f
+(1 row)
+
+SELECT float8_array_is_vector('{{1.0},{1.0}}');
+ float8_array_is_vector 
+------------------------
+ f
+(1 row)
+
+SELECT float8_array_is_vector(ARRAY[NULL]::float8[]);
+ float8_array_is_vector 
+------------------------
+ f
+(1 row)
+
+SELECT float8_array_is_vector('{1,2,3}');
+ float8_array_is_vector 
+------------------------
+ t
+(1 row)
+
+-- euclidean_norm
+SELECT euclidean_norm('{1,2,3}');
+ euclidean_norm  
+-----------------
+ 3.7416573867739
+(1 row)
+
+-- normalize_vector
+SELECT normalize_vector('{0,0,0}');
+ERROR:  division by zero
+SELECT normalize_vector('{1,2,3}');
+                   normalize_vector                   
+------------------------------------------------------
+ {0.26726124191242,0.53452248382485,0.80178372573727}
+(1 row)
+
+-- dot_product
+SELECT dot_product('{1,2,3}', '{4,5,6}');
+ dot_product 
+-------------
+          32
+(1 row)
+
+-- squared_euclidean_distance
+SELECT squared_euclidean_distance('{1,2}', '{4,5,6}');
+ERROR:  vectors must have the same number of elements
+SELECT squared_euclidean_distance('{1,2,3}', '{4,5,6}');
+ squared_euclidean_distance 
+----------------------------
+                         27
+(1 row)
+
+-- euclidean_distance
+SELECT euclidean_distance('{1,2}', '{4,5,6}');
+ERROR:  vectors must have the same number of elements
+SELECT euclidean_distance('{1,2,3}', '{4,5,6}');
+ euclidean_distance 
+--------------------
+    5.1961524227066
+(1 row)
+
+-- cosine_distance
+SELECT cosine_distance('{1,2}', '{4,5,6}');
+ERROR:  vectors must have the same number of elements
+SELECT cosine_distance('{1,2,3}', '{4,5,6}');
+  cosine_distance  
+-------------------
+ 0.025368153802924
+(1 row)
+
+-- taxicab_distance
+SELECT taxicab_distance('{1,2}', '{4,5,6}');
+ERROR:  vectors must have the same number of elements
+SELECT taxicab_distance('{1,2,3}', '{4,5,6}');
+ taxicab_distance 
+------------------
+                9
+(1 row)
+
+-- chebyshev_distance
+SELECT chebyshev_distance('{1,2}', '{4,5,6}');
+ERROR:  vectors must have the same number of elements
+SELECT chebyshev_distance('{1,2,3}', '{4,5,6}');
+ chebyshev_distance 
+--------------------
+                  3
+(1 row)
+
+-- standard_unit_vector
+SELECT standard_unit_vector(0, 0);
+ERROR:  standard unit vectors must have at least one element
+SELECT standard_unit_vector(1, 0);
+ERROR:  cannot set nonexistent element to one
+SELECT standard_unit_vector(1, 2);
+ERROR:  cannot set nonexistent element to one
+SELECT standard_unit_vector(3, 1);
+ standard_unit_vector 
+----------------------
+ {1,0,0}
+(1 row)
+
+SELECT standard_unit_vector(3, 3);
+ standard_unit_vector 
+----------------------
+ {0,0,1}
+(1 row)
+
+-- kmeans
+SELECT * FROM kmeans(0, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1);
+ERROR:  must request at least one center
+SELECT * FROM kmeans(5, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1);
+ERROR:  number of sample vectors must be greater than or equal to the number of requested centers
+SELECT * FROM kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,NULL}}', 1);
+ERROR:  vectors must not contain nulls
+SELECT * FROM kmeans(2, ARRAY[[],[]]::float8[][], 1);
+ERROR:  number of sample vectors must be greater than or equal to the number of requested centers
+SELECT * FROM kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', -1);
+ERROR:  must request at least one iteration
+SELECT * FROM kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1) ORDER BY 1;
+     kmeans     
+----------------
+ {2.5,3.5,4.5}
+ {8.5,9.5,10.5}
+(2 rows)
+
+-- closest_vector
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', ARRAY[]::float8[]);
+ERROR:  vectors must have at least one element
+SELECT closest_vector(ARRAY[]::float8[], '{1,2,3}');
+ERROR:  must provide at least one candidate vector
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,NULL}}', '{1,2,3}');
+ERROR:  vectors must not contain nulls
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', '{1,2}');
+ERROR:  vectors must have the same number of elements
+SELECT closest_vector('{1,2,3}', '{1,2}');
+ERROR:  vectors must have the same number of elements
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', '{10,11,12}');
+ closest_vector 
+----------------
+              2
+(1 row)
+
+SELECT closest_vector('{1,2,3}', '{2,3,4}');
+ closest_vector 
+----------------
+              0
+(1 row)
+
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 3624035639..db453b4cdd 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -78,7 +78,7 @@ test: brin_bloom brin_multi
 # psql depends on create_am
 # amutils depends on geometry, create_index_spgist, hash_index, brin
 # ----------
-test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role
+test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role vectors
 
 # collate.*.utf8 tests cannot be run in parallel with each other
 test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252
diff --git a/src/test/regress/sql/vectors.sql b/src/test/regress/sql/vectors.sql
new file mode 100644
index 0000000000..c2f45fff61
--- /dev/null
+++ b/src/test/regress/sql/vectors.sql
@@ -0,0 +1,61 @@
+SET extra_float_digits = -1;
+
+-- float8_array_is_vector
+SELECT float8_array_is_vector(ARRAY[]::float8[]);
+SELECT float8_array_is_vector('{{1.0},{1.0}}');
+SELECT float8_array_is_vector(ARRAY[NULL]::float8[]);
+SELECT float8_array_is_vector('{1,2,3}');
+
+-- euclidean_norm
+SELECT euclidean_norm('{1,2,3}');
+
+-- normalize_vector
+SELECT normalize_vector('{0,0,0}');
+SELECT normalize_vector('{1,2,3}');
+
+-- dot_product
+SELECT dot_product('{1,2,3}', '{4,5,6}');
+
+-- squared_euclidean_distance
+SELECT squared_euclidean_distance('{1,2}', '{4,5,6}');
+SELECT squared_euclidean_distance('{1,2,3}', '{4,5,6}');
+
+-- euclidean_distance
+SELECT euclidean_distance('{1,2}', '{4,5,6}');
+SELECT euclidean_distance('{1,2,3}', '{4,5,6}');
+
+-- cosine_distance
+SELECT cosine_distance('{1,2}', '{4,5,6}');
+SELECT cosine_distance('{1,2,3}', '{4,5,6}');
+
+-- taxicab_distance
+SELECT taxicab_distance('{1,2}', '{4,5,6}');
+SELECT taxicab_distance('{1,2,3}', '{4,5,6}');
+
+-- chebyshev_distance
+SELECT chebyshev_distance('{1,2}', '{4,5,6}');
+SELECT chebyshev_distance('{1,2,3}', '{4,5,6}');
+
+-- standard_unit_vector
+SELECT standard_unit_vector(0, 0);
+SELECT standard_unit_vector(1, 0);
+SELECT standard_unit_vector(1, 2);
+SELECT standard_unit_vector(3, 1);
+SELECT standard_unit_vector(3, 3);
+
+-- kmeans
+SELECT * FROM kmeans(0, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1);
+SELECT * FROM kmeans(5, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1);
+SELECT * FROM kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,NULL}}', 1);
+SELECT * FROM kmeans(2, ARRAY[[],[]]::float8[][], 1);
+SELECT * FROM kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', -1);
+SELECT * FROM kmeans(2, '{{1,2,3},{4,5,6},{7,8,9},{10,11,12}}', 1) ORDER BY 1;
+
+-- closest_vector
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', ARRAY[]::float8[]);
+SELECT closest_vector(ARRAY[]::float8[], '{1,2,3}');
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,NULL}}', '{1,2,3}');
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', '{1,2}');
+SELECT closest_vector('{1,2,3}', '{1,2}');
+SELECT closest_vector('{{1,2,3},{4,5,6},{7,8,9}}', '{10,11,12}');
+SELECT closest_vector('{1,2,3}', '{2,3,4}');
-- 
2.25.1

