commit 5e861eff888c9557b9e40d418fa6d0839d2d886a
Author: Joey Adams
Date:   Fri Aug 13 04:20:06 2010 -0400

    Added several utility functions/macros to the backend

    * getEnumLabelOids: streamlined conversion of enum labels to OIDs
    * FN_EXTRA, FN_EXTRA_ALLOC, FN_MCXT: macros to cut down on boilerplate
      when working with fcinfo->flinfo->fn_mcxt
    * getTypeInfo: wrapper around get_type_io_data / fmgr_info_cxt /
      get_type_category_preferred that stores results in a structure
      called TypeInfo
    * pg_substring, pg_encoding_substring: slicing of multibyte-encoded
      strings
    * server_to_utf8, utf8_to_server: convenience routines for converting
      between the database encoding and UTF-8
    * text_to_utf8_cstring, utf8_cstring_to_text,
      utf8_cstring_to_text_with_len: variants of text_to_cstring and
      company that also convert into and out of UTF-8

diff --git a/src/backend/utils/adt/enum.c b/src/backend/utils/adt/enum.c
index 69562db..9259c99 100644
--- a/src/backend/utils/adt/enum.c
+++ b/src/backend/utils/adt/enum.c
@@ -13,6 +13,7 @@
  */
 #include "postgres.h"
 
+#include "catalog/namespace.h"
 #include "catalog/pg_enum.h"
 #include "fmgr.h"
 #include "utils/array.h"
@@ -25,6 +26,7 @@
 
 static ArrayType *enum_range_internal(Oid enumtypoid, Oid lower, Oid upper);
 static int	enum_elem_cmp(const void *left, const void *right);
+static int	enum_label_cmp(const void *left, const void *right);
 
 
 /* Basic I/O support */
@@ -412,6 +414,84 @@ enum_range_internal(Oid enumtypoid, Oid lower, Oid upper)
 	return result;
 }
 
+/*
+ * getEnumLabelOids
+ *	  Look up the OIDs of enum labels.  Enum label OIDs are needed to
+ *	  return values of a custom enum type from a C function.
+ *
+ *	  Callers should typically cache the OIDs produced by this function
+ *	  using FN_EXTRA, as retrieving enum label OIDs is somewhat expensive.
+ *
+ *	  Every labels[i].index must be in the range [0, count), and oid_out
+ *	  must be allocated to hold count items.  Note that getEnumLabelOids
+ *	  sorts the labels[] array passed to it.
+ *
+ *	  Any labels not found in the enum will have their corresponding
+ *	  oid_out entries set to InvalidOid.
+ *
+ *	  Sample usage:
+ *
+ *	  -- SQL --
+ *	  CREATE TYPE colors AS ENUM ('red', 'green', 'blue');
+ *
+ *	  -- C --
+ *	  enum Colors {RED, GREEN, BLUE, COLOR_COUNT};
+ *
+ *	  static EnumLabel enum_labels[COLOR_COUNT] =
+ *	  {
+ *		  {RED,   "red"},
+ *		  {GREEN, "green"},
+ *		  {BLUE,  "blue"}
+ *	  };
+ *
+ *	  Oid *label_oids = palloc(COLOR_COUNT * sizeof(Oid));
+ *	  getEnumLabelOids("colors", enum_labels, label_oids, COLOR_COUNT);
+ *
+ *	  PG_RETURN_OID(label_oids[GREEN]);
+ */
+void
+getEnumLabelOids(const char *typname, EnumLabel labels[], Oid oid_out[], int count)
+{
+	CatCList   *list;
+	Oid			enumtypoid;
+	int			total;
+	int			i;
+	EnumLabel	key;
+	EnumLabel  *found;
+
+	enumtypoid = TypenameGetTypid(typname);
+	Assert(OidIsValid(enumtypoid));
+
+	qsort(labels, count, sizeof(EnumLabel), enum_label_cmp);
+
+	for (i = 0; i < count; i++)
+	{
+		/* Initialize oid_out items to InvalidOid. */
+		oid_out[i] = InvalidOid;
+
+		/* Make sure EnumLabel indices are in range. */
+		Assert(labels[i].index >= 0 && labels[i].index < count);
+	}
+
+	list = SearchSysCacheList1(ENUMTYPOIDNAME,
+							   ObjectIdGetDatum(enumtypoid));
+	total = list->n_members;
+
+	for (i = 0; i < total; i++)
+	{
+		HeapTuple	tup = &list->members[i]->tuple;
+		Oid			oid = HeapTupleGetOid(tup);
+		Form_pg_enum en = (Form_pg_enum) GETSTRUCT(tup);
+
+		key.label = NameStr(en->enumlabel);
+		found = bsearch(&key, labels, count, sizeof(EnumLabel), enum_label_cmp);
+		if (found != NULL)
+			oid_out[found->index] = oid;
+	}
+
+	ReleaseCatCacheList(list);
+}
+
 /* qsort comparison function for Datums that are OIDs */
 static int
 enum_elem_cmp(const void *left, const void *right)
@@ -425,3 +505,13 @@ enum_elem_cmp(const void *left, const void *right)
 		return 1;
 	return 0;
 }
+
+/* qsort comparison function for EnumLabel entries used by getEnumLabelOids */
+static int
+enum_label_cmp(const void *left, const void *right)
+{
+	const char *l = ((EnumLabel *) left)->label;
+	const char *r = ((EnumLabel *) right)->label;
+
+	return strcmp(l, r);
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 94766cd..c8f23d4 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -177,6 +177,75 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
 		pfree(srcunpacked);
 }
 
+/*
+ * text_to_utf8_cstring
+ *
+ * Just like text_to_cstring, but yields a C string
+ * encoded in UTF-8 instead of the server encoding.
+ */
+char *
+text_to_utf8_cstring(const text *t)
+{
+	/* must cast away the const, just like in text_to_cstring */
+	text	   *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
+	const char *data = VARDATA_ANY(tunpacked);
+	int			len = VARSIZE_ANY_EXHDR(tunpacked);
+	char	   *result;
+
+	result = server_to_utf8(data, len);
+	if (result == data)
+		result = pnstrdup(data, len);
+
+	if (tunpacked != t)
+		pfree(tunpacked);
+
+	return result;
+}
+
+/*
+ * utf8_cstring_to_text
+ *
+ * Just like cstring_to_text, but takes a C string
+ * encoded in UTF-8 instead of the server encoding.
+ */
+text *
+utf8_cstring_to_text(const char *s)
+{
+	return utf8_cstring_to_text_with_len(s, strlen(s));
+}
+
+/*
+ * utf8_cstring_to_text_with_len
+ *
+ * Just like cstring_to_text_with_len, but takes a C string
+ * encoded in UTF-8 instead of the server encoding.
+ *
+ * The input string should not contain null characters.
+ */
+text *
+utf8_cstring_to_text_with_len(const char *s, int len)
+{
+	char	   *cstring;
+	int			cstring_len;
+	text	   *result;
+
+	cstring = utf8_to_server(s, len);
+	if (cstring == s)
+		cstring_len = len;
+	else
+		cstring_len = strlen(cstring);
+
+	result = (text *) palloc(cstring_len + VARHDRSZ);
+
+	SET_VARSIZE(result, cstring_len + VARHDRSZ);
+	memcpy(VARDATA(result), cstring, cstring_len);
+
+	if (cstring != s)
+		pfree(cstring);
+
+	return result;
+}
+
 
 /*****************************************************************************
  *	 USER I/O ROUTINES														 *
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index 19a4a45..5b6b823 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -1844,6 +1844,37 @@ get_type_io_data(Oid typid,
 	ReleaseSysCache(typeTuple);
 }
 
+/*
+ * getTypeInfo
+ *		Retrieve information about a type, along with either its
+ *		input, output, binary receive, or binary send procedure.
+ *
+ *		which_func should be one of:
+ *			IOFunc_input
+ *			IOFunc_output
+ *			IOFunc_receive
+ *			IOFunc_send
+ *
+ *		mcxt is the memory context that the IO function will use to
+ *		store subsidiary data.  It should live at least as long as
+ *		the TypeInfo structure.
+ */
+void
+getTypeInfo(TypeInfo *d, Oid type, IOFuncSelector which_func, MemoryContext mcxt)
+{
+	d->type = type;
+	d->which_func = which_func;
+	d->mcxt = mcxt;
+
+	get_type_io_data(type, which_func,
+					 &d->typlen, &d->typbyval, &d->typalign,
+					 &d->typdelim, &d->typioparam, &d->typiofunc);
+	fmgr_info_cxt(d->typiofunc, &d->proc, d->mcxt);
+
+	get_type_category_preferred(type,
+								&d->typcategory, &d->typispreferred);
+}
+
 #ifdef NOT_USED
 char
 get_typalign(Oid typid)
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 0995a75..dfd4136 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -568,6 +568,36 @@ pg_server_to_client(const char *s, int len)
 }
 
 /*
+ * server_to_utf8, utf8_to_server
+ *		Adaptations of pg_do_encoding_conversion for simplifying UTF-8 conversions.
+ *
+ * Sometimes, it makes more sense to operate primarily in UTF-8 rather than
+ * the server encoding.  For instance, the JSON data type operates in UTF-8
+ * because it needs to encode/decode individual characters when dealing with
+ * Unicode escapes, but there is no simple and efficient way to do that
+ * in the server encoding.
+ *
+ * Just like pg_do_encoding_conversion, if no conversion is done,
+ * the original pointer given is returned.
+ *
+ * These functions are no-ops when the server encoding is UTF-8.
+ */
+
+char *
+server_to_utf8(const char *s, int len)
+{
+	return (char *) pg_do_encoding_conversion(
+		(unsigned char *) s, len, GetDatabaseEncoding(), PG_UTF8);
+}
+
+char *
+utf8_to_server(const char *s, int len)
+{
+	return (char *) pg_do_encoding_conversion(
+		(unsigned char *) s, len, PG_UTF8, GetDatabaseEncoding());
+}
+
+/*
  * Perform default encoding conversion using cached FmgrInfo. Since
  * this function does not access database at all, it is safe to call
  * outside transactions.  If the conversion has not been set up by
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 4b98c8b..8c56592 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1390,18 +1390,24 @@ pg_mic_mblen(const unsigned char *mbstr)
 	return pg_mule_mblen(mbstr);
 }
 
+static mblen_converter
+encoding_mblen_converter(int encoding)
+{
+	Assert(PG_VALID_ENCODING(encoding));
+
+	return ((encoding >= 0 &&
+			 encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+		pg_wchar_table[encoding].mblen :
+		pg_wchar_table[PG_SQL_ASCII].mblen);
+}
+
 /*
  * Returns the byte length of a multibyte character.
  */
 int
 pg_encoding_mblen(int encoding, const char *mbstr)
 {
-	Assert(PG_VALID_ENCODING(encoding));
-
-	return ((encoding >= 0 &&
-			 encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-		((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
-		((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
+	return (*encoding_mblen_converter(encoding)) ((const unsigned char *) mbstr);
 }
 
 /*
@@ -1641,4 +1647,123 @@ report_untranslatable_char(int src_encoding, int dest_encoding,
 			 pg_enc2name_tbl[dest_encoding].name)));
 }
 
+/*
+ * pg_substring
+ *		Find substring bounds in a string in the database encoding.
+ *
+ *		The requested start and length are clipped to fit the string.
+ *
+ *		src and srcbytes:		 input string slice
+ *		start and length:		 start and number of characters requested
+ *		out_start and out_bytes: substring slice
+ *		out_length:				 number of characters in substring
+ *
+ *		Unlike the SQL substring function, the start argument
+ *		of this function is zero-based.
+ *
+ *		Example (assume UTF-8 all around):
+ *			const char *in = "⁰ ¹ ² ³"; // "\342\201\260 \302\271 \302\262 \302\263"
+ *			const char *out_start;
+ *			int			out_bytes;
+ *			int			out_chars;
+ *
+ *			pg_substring(in, strlen(in),
+ *						 2, 100,
+ *						 &out_start, &out_bytes, &out_chars);
+ *
+ *		out_start will point to the "¹", or "\302\271".
+ *		out_bytes will be 8.
+ *		out_chars will be 5.
+ */
+void
+pg_substring(const char *src, int srcbytes,
+			 int start, int length,
+			 const char **out_start, int *out_bytes, int *out_length)
+{
+	pg_encoding_substring(GetDatabaseEncoding(),
+						  src, srcbytes,
+						  start, length,
+						  out_start, out_bytes, out_length);
+}
+
+/*
+ * pg_encoding_substring
+ *		Same as pg_substring, but for a string in the given encoding.
+ */
+void
+pg_encoding_substring(int encoding,
+					  const char *src, int srcbytes,
+					  int start, int length,
+					  const char **out_start, int *out_bytes, int *out_length)
+{
+	const char *e = src + srcbytes;
+	const char *sub_start;
+	const char *sub_end;
+	int			sub_length;
+	mblen_converter mblen;
+	int			len;
+
+	if (start < 0)
+	{
+		length += start;
+		start = 0;
+	}
+	if (length < 0)
+		length = 0;
+
+	/* optimization for single-byte encoding */
+	if (pg_encoding_max_length(encoding) == 1)
+	{
+		/* Clip start to the string so we never report a negative size. */
+		start = Min(start, srcbytes);
+
+		*out_start = src + start;
+		*out_bytes = *out_length = Min(length, srcbytes - start);
+		return;
+	}
+
+	/*
+	 * Get the length callback once so it doesn't have to be looked up every
+	 * time we call it.
+	 */
+	mblen = encoding_mblen_converter(encoding);
+
+	/* Find the beginning of the substring. */
+	sub_start = src;
+	while (start > 0 && sub_start < e)
+	{
+		len = (*mblen) ((const unsigned char *) sub_start);
+
+		if (sub_start + len > e)
+		{
+			Assert(false);		/* Clipped multibyte character */
+			break;
+		}
+
+		sub_start += len;
+		start--;
+	}
+
+	/* Find the end and length of the substring. */
+	sub_end = sub_start;
+	sub_length = 0;
+	while (sub_length < length && sub_end < e)
+	{
+		len = (*mblen) ((const unsigned char *) sub_end);
+
+		if (sub_end + len > e)
+		{
+			Assert(false);		/* Clipped multibyte character */
+			break;
+		}
+
+		sub_end += len;
+		sub_length++;
+	}
+
+	*out_start = sub_start;
+	*out_bytes = sub_end - sub_start;
+	*out_length = sub_length;
+}
+
 #endif
diff --git a/src/include/fmgr.h b/src/include/fmgr.h
index c502b96..3d6d0c1 100644
--- a/src/include/fmgr.h
+++ b/src/include/fmgr.h
@@ -544,6 +544,43 @@ extern void **find_rendezvous_variable(const char *varName);
 extern int AggCheckCallContext(FunctionCallInfo fcinfo,
 					MemoryContext *aggcontext);
 
+/*
+ * FN_EXTRA, FN_EXTRA_ALLOC, FN_MCXT
+ *	   Macros for manipulating context preserved across function calls.
+ *
+ *	   FN_EXTRA is typically used for caching lookups and other nontrivial
+ *	   operations across multiple calls of a user-defined function.
+ *
+ *	   These macros refer to fcinfo, so they can only be used where a
+ *	   variable of that name is in scope.
+ *
+ *	   Do not use FN_EXTRA in a set-returning function.  Use user_fctx instead.
+ *
+ *	   Typical usage looks like:
+ *
+ *	   my_extra = FN_EXTRA();
+ *	   if (my_extra == NULL)
+ *	   {
+ *		   my_extra = FN_EXTRA_ALLOC(sizeof(MyExtra));
+ *		   my_extra->type_name = NULL;
+ *	   }
+ *
+ *	   if (my_extra->type_name == NULL ||
+ *		   strcmp(my_extra->type_name, type_name) != 0)
+ *	   {
+ *		   my_extra->type_name = MemoryContextStrdup(FN_MCXT(), type_name);
+ *		   my_extra->type_id = TypenameGetTypid(my_extra->type_name);
+ *	   }
+ */
+#define FN_EXTRA() (fcinfo->flinfo->fn_extra)
+#define FN_EXTRA_ALLOC(size) \
+	(fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, size))
+
+/*
+ * Data allocated inside of FN_EXTRA() should be allocated into FN_MCXT()
+ * so it is preserved across calls
+ */
+#define FN_MCXT() (fcinfo->flinfo->fn_mcxt)
 
 /*
  * !!! OLD INTERFACE !!!
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 389be5c..2500b81 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -420,6 +420,9 @@ extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
 extern char *pg_client_to_server(const char *s, int len);
 extern char *pg_server_to_client(const char *s, int len);
 
+extern char *server_to_utf8(const char *s, int len);
+extern char *utf8_to_server(const char *s, int len);
+
 extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
 extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
 
@@ -466,4 +469,12 @@ extern bool pg_utf8_islegal(const unsigned char *source, int length);
 extern WCHAR *pgwin32_toUTF16(const char *str, int len, int *utf16len);
 #endif
 
+extern void pg_substring(const char *src, int srcbytes,
+			 int start, int length,
+			 const char **out_start, int *out_bytes, int *out_length);
+extern void pg_encoding_substring(int encoding,
+					  const char *src, int srcbytes,
+					  int start, int length,
+					  const char **out_start, int *out_bytes, int *out_length);
+
 #endif   /* PG_WCHAR_H */
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index a4c6180..427fbb3 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -17,6 +17,12 @@
 #include "fmgr.h"
 #include "nodes/parsenodes.h"
 
+typedef struct
+{
+	int			index;
+	const char *label;
+} EnumLabel;
+
 /*
  *		Defined in adt/
  */
@@ -163,6 +169,7 @@ extern Datum enum_first(PG_FUNCTION_ARGS);
 extern Datum enum_last(PG_FUNCTION_ARGS);
 extern Datum enum_range_bounds(PG_FUNCTION_ARGS);
 extern Datum enum_range_all(PG_FUNCTION_ARGS);
+extern void getEnumLabelOids(const char *typname, EnumLabel labels[], Oid oid_out[], int count);
 
 /* int.c */
 extern Datum int2in(PG_FUNCTION_ARGS);
@@ -675,6 +682,9 @@ extern text *cstring_to_text(const char *s);
 extern text *cstring_to_text_with_len(const char *s, int len);
 extern char *text_to_cstring(const text *t);
 extern void text_to_cstring_buffer(const text *src, char *dst, size_t dst_len);
+extern char *text_to_utf8_cstring(const text *t);
+extern text *utf8_cstring_to_text(const char *s);
+extern text *utf8_cstring_to_text_with_len(const char *s, int len);
 
 #define CStringGetTextDatum(s) PointerGetDatum(cstring_to_text(s))
 #define TextDatumGetCString(d) text_to_cstring((text *) DatumGetPointer(d))
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 066ad76..7ec3e35 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -15,6 +15,7 @@
 
 #include "access/attnum.h"
 #include "access/htup.h"
+#include "fmgr.h"
 #include "nodes/pg_list.h"
 
 /* I/O function selector for get_type_io_data */
@@ -26,6 +27,24 @@ typedef enum IOFuncSelector
 	IOFunc_send
 } IOFuncSelector;
 
+typedef struct TypeInfo
+{
+	Oid			type;
+	IOFuncSelector which_func;
+	MemoryContext mcxt;
+
+	int16		typlen;
+	bool		typbyval;
+	char		typalign;
+	char		typdelim;
+	Oid			typioparam;
+	Oid			typiofunc;
+	FmgrInfo	proc;
+
+	char		typcategory;
+	bool		typispreferred;
+} TypeInfo;
+
 /* Hook for plugins to get control in get_attavgwidth() */
 typedef int32 (*get_attavgwidth_hook_type) (Oid relid, AttrNumber attnum);
 extern PGDLLIMPORT get_attavgwidth_hook_type get_attavgwidth_hook;
@@ -106,6 +125,8 @@ extern void get_type_io_data(Oid typid,
 				 char *typdelim,
 				 Oid *typioparam,
 				 Oid *func);
+extern void getTypeInfo(TypeInfo *d, Oid type, IOFuncSelector which_func,
+			MemoryContext mcxt);
 extern char get_typstorage(Oid typid);
 extern Node *get_typdefault(Oid typid);
 extern char get_typtype(Oid typid);