Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Started by Tom Turelinckx | almost 9 years ago | 14 messages | bugs
Jump to latest
#1Tom Turelinckx
tom@turelinckx.be

Hi,

I was trying to compile 9.4.12 on sparc from the debian source package in the pgdg repo, but it's failing multiple regression tests. One numeric test crashes the backend:

LOG: server process (PID 20659) was terminated by signal 10: Bus error
DETAIL: Failed process was running: SELECT '' AS to_char_6, to_char(val, 'FMS9999999999999999.999999999999999') FROM num_data;

Reading symbols from /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/src/backend/postgres...done.
[New LWP 20659]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/sparc-linux-gnu/libthread_db.so.1".
Core was generated by `postgres: turelto regression [local] SELECT '.
Program terminated with signal 10, Bus error.
#0 NUM_numpart_to_char (id=3, Np=0xffa04a54)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c:4419
4419 if (Np->Num->lsign == NUM_LSIGN_PRE)
(gdb) l
4414 (Np->num_curr >= Np->out_pre_spaces || (IS_ZERO(Np->Num) && Np->Num->zero_start == Np->num_curr)) &&
4415 (IS_PREDEC_SPACE(Np) == FALSE || (Np->last_relevant && *Np->last_relevant == '.')))
4416 {
4417 if (IS_LSIGN(Np->Num))
4418 {
4419 if (Np->Num->lsign == NUM_LSIGN_PRE)
4420 {
4421 if (Np->sign == '-')
4422 strcpy(Np->inout_p, Np->L_negative_sign);
4423 else
(gdb) bt full
#0 NUM_numpart_to_char (id=3, Np=0xffa04a54)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c:4419
end = <optimized out>
#1 NUM_processor (node=<optimized out>, Num=<optimized out>, inout=<optimized out>, number=<optimized out>,
from_char_input_len=0, to_char_out_pre_spaces=<optimized out>, sign=43, is_to_char=1 '\001',
collid=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c:4773
n = <optimized out>
_Np = {is_to_char = 1 '\001', Num = 0xffa04b18, sign = 43, sign_wrote = 0, num_count = 30, num_in = 0,
num_curr = 15, out_pre_spaces = 15, read_dec = 0, read_post = 0, read_pre = 0,
number = 0x61b440 "0.", '0' <repeats 15 times>, number_p = 0x61b440 "0.", '0' <repeats 15 times>,
inout = 0x61b1d4 "", inout_p = 0x61b1d4 "", last_relevant = 0x61b441 ".", '0' <repeats 15 times>,
L_negative_sign = 0x4c1438 "-", L_positive_sign = 0x3f5638 "+", decimal = 0x4cd530 ".",
L_thousands_sep = 0x49dd78 ",", L_currency_symbol = 0x4edd08 " "}
#2 0x002f2920 in numeric_to_char (fcinfo=0x618bd4)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c:5181
len = <optimized out>
value = <optimized out>
fmt = <optimized out>
Num = {pre = 16, post = 15, lsign = -1, flag = 98, pre_lsign_num = 0, multi = 0, zero_start = 0, zero_end = 0,
need_locale = 1}
format = 0x540d30
result = 0x61b1d0
shouldFree = <optimized out>
out_pre_spaces = <optimized out>
sign = 43
numstr = 0x61b440 "0.", '0' <repeats 15 times>
orgnum = <optimized out>
p = <optimized out>
x = <optimized out>
#3 0x001aa0f0 in ExecMakeFunctionResultNoSets (fcache=0x618b98, econtext=0x618ab8, isNull=0x619721 "",
isDone=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execQual.c:2026
arg = <optimized out>
result = <optimized out>
fcinfo = 0x618bd4
fcusage = {fs = 0x0, save_f_total_time = {tv_sec = 0, tv_usec = 0}, save_total = {tv_sec = 0, tv_usec = 0},
f_start = {tv_sec = 0, tv_usec = 0}}
i = <optimized out>
#4 0x001ae258 in ExecTargetList (isDone=0xffa04c84, itemIsDone=0x6197d8, isnull=0x619720 "", values=0x619710,
econtext=0x618ab8, targetlist=0x6197b0)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execQual.c:5334
gstate = <optimized out>
tle = <optimized out>
resind = <optimized out>
oldContext = 0x5f6640
tl = 0x6197c8
haveDoneSets = <optimized out>
#5 ExecProject (projInfo=<optimized out>, isDone=0xffa04c84)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execQual.c:5549
slot = 0x619018
econtext = 0x618ab8
numSimpleVars = <optimized out>
#6 0x001ae650 in ExecScan (node=0x618a30, accessMtd=0x1bfc80 <SeqNext>, recheckMtd=0x1bfc60 <SeqRecheck>)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execScan.c:207
slot = 0x619088
econtext = 0x618ab8
qual = 0x0
projInfo = 0x619730
isDone = ExprSingleResult
resultSlot = <optimized out>
#7 0x001a7650 in ExecProcNode (node=0x618a30)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execProcnode.c:400
result = <optimized out>
__func__ = "ExecProcNode"
#8 0x001a443c in ExecutePlan (dest=0x5d11f8, direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>,
operation=CMD_SELECT, planstate=0x618a30, estate=0x6189a8)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execMain.c:1490
slot = <optimized out>
current_tuple_count = 0
#9 standard_ExecutorRun (queryDesc=0x605f88, direction=<optimized out>, count=0)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/executor/execMain.c:319
estate = 0x6189a8
operation = CMD_SELECT
dest = 0x5d11f8
sendTuples = <optimized out>
oldcontext = 0x5f66f0
#10 0x002ac734 in PortalRunSelect (portal=0x5fb768, forward=1 '\001', count=0, dest=0x5d11f8)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/tcop/pquery.c:942
queryDesc = 0x605f88
direction = ForwardScanDirection
nprocessed = <optimized out>
__func__ = "PortalRunSelect"
#11 0x002adabc in PortalRun (portal=0x5fb768, count=2147483647, isTopLevel=1 '\001', dest=0x5d11f8, altdest=0x5d11f8,
completionTag=0xffa05040 "") at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/tcop/pquery.c:786
save_exception_stack = 0xffa04f84
save_context_stack = 0x0
local_sigjmp_buf = {{__jmpbuf = {-285518332, -285518572, 292667848}, __mask_was_saved = 0, __saved_mask = {
__val = {4288696024, 5679664, 16777218, 0, 13, 1076, 0, 0, 5611552, 5614592, 5459968, 5611520, 5397504,
120, 5681576, 1886539776, 6097184, 0, 0, 0, 0, 6097168, 4288696024, 2792652, 90, 6247456, 2, 1, 0, 1,
1024, 0}}}}
result = <optimized out>
nprocessed = <optimized out>
saveTopTransactionResourceOwner = 0x56aa30
---Type <return> to continue, or q <return> to quit---
saveTopTransactionContext = 0x56ab38
saveActivePortal = 0x0
saveResourceOwner = 0x56aa30
savePortalContext = 0x0
saveMemoryContext = 0x56ab38
__func__ = "PortalRun"
#12 0x002a9ba4 in exec_simple_query (
query_string=0x5cf4d8 "SELECT '' AS to_char_6, to_char(val, 'FMS", '9' <repeats 16 times>, ".", '9' <repeats 15 times>, "') FROM num_data;") at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/tcop/postgres.c:1072
parsetree = 0x5d00e8
portal = 0x5fb768
snapshot_set = <optimized out>
commandTag = <optimized out>
completionTag = "\000ELECT 10\000\000E", '\000' <repeats 35 times>, "\024\000\000\000\024\377\377\377\377\000R\\\304\000R\\P"
plantree_list = 0x5d11e0
receiver = 0x5d11f8
format = 0
dest = DestRemote
parsetree_list = 0x5d0180
save_log_statement_stats = 0 '\000'
was_logged = 0 '\000'
msec_str = "\000\240P\250\000\006\004\320\000\000\000\000\000X\324d0\n\000\034", '\000' <repeats 11 times>, "\002"
parsetree_item = 0x5d0170
isTopLevel = <optimized out>
#13 PostgresMain (argc=<optimized out>, argv=<optimized out>, dbname=0x5694a0 "regression", username=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/tcop/postgres.c:4100
query_string = 0x5cf4d8 "SELECT '' AS to_char_6, to_char(val, 'FMS", '9' <repeats 16 times>, ".", '9' <repeats 15 times>, "') FROM num_data;"
firstchar = 6095080
input_message = {
data = 0x5cf4d8 "SELECT '' AS to_char_6, to_char(val, 'FMS", '9' <repeats 16 times>, ".", '9' <repeats 15 times>, "') FROM num_data;", len = 95, maxlen = 1024, cursor = 95}
local_sigjmp_buf = {{__jmpbuf = {-285518572, -285521052, 292654784}, __mask_was_saved = 1, __saved_mask = {
__val = {0, 0, 0, 0, 1882663060, 1879054856, 1881810088, 0, 1, 0, 0, 0, 1881097612, 4, 0, 0, 0, 0, 0, 0,
1886539776, 1886551248, 1886551248, 5821840, 0, 0, 0, 4288696392, 1885505112, 4288696368, 2364184,
0}}}}
send_ready_for_query = 0 '\000'
__func__ = "PostgresMain"
#14 0x000606b0 in BackendRun (port=0x58d3d8)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/postmaster/postmaster.c:4301
ac = 1
secs = 551365461
usecs = 297758
i = 1
av = 0x569610
maxac = <optimized out>
#15 BackendStartup (port=0x58d3d8)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/postmaster/postmaster.c:3964
bn = 0x10c3
pid = 0
#16 ServerLoop () at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/postmaster/postmaster.c:1694
port = 0x58d3d8
rmask = {fds_bits = {16, 0 <repeats 31 times>}}
selres = <optimized out>
now = <optimized out>
readmask = {fds_bits = {16, 0 <repeats 31 times>}}
nSockets = 5
last_lockfile_recheck_time = 1498050259
last_touch_time = 1498050259
__func__ = "ServerLoop"
#17 0x0024d334 in PostmasterMain (argc=8, argv=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/postmaster/postmaster.c:1302
opt = <optimized out>
status = <optimized out>
userDoption = <optimized out>
listen_addr_saved = <optimized out>
i = <optimized out>
output_config_variable = <optimized out>
__func__ = "PostmasterMain"
#18 0x000616f0 in main (argc=8, argv=0x568b88)
at /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/main/main.c:233
No locals.

It seems this issue was introduced in 9.4.9: 9.4.9, 9.4.10, 9.4.11 and 9.4.12 all crash at the same test at the same line in formatting.c, but 9.4.8 builds successfully and passes all tests. It also seems that both NUM_numpart_to_char and the failing test have not been touched for, eh, decades, so the root cause must be somewhere else.

The same issue is present in 9.6.3:

LOG: server process (PID 10632) was terminated by signal 10: Bus error
DETAIL: Failed process was running: SELECT '' AS to_char_6, to_char(val, 'FMS9999999999999999.999999999999999') FROM num_data;

Reading symbols from /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/src/backend/postgres...done.
[New LWP 10632]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/sparc-linux-gnu/libthread_db.so.1".
Core was generated by `postgres: turelto regression [local] SELECT '.
Program terminated with signal 10, Bus error.
#0 NUM_numpart_to_char (id=3, Np=0xff924b5c)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/utils/adt/formatting.c:4434
4434 if (Np->Num->lsign == NUM_LSIGN_PRE)
(gdb) l
4429 (Np->num_curr >= Np->out_pre_spaces || (IS_ZERO(Np->Num) && Np->Num->zero_start == Np->num_curr)) &&
4430 (IS_PREDEC_SPACE(Np) == FALSE || (Np->last_relevant && *Np->last_relevant == '.')))
4431 {
4432 if (IS_LSIGN(Np->Num))
4433 {
4434 if (Np->Num->lsign == NUM_LSIGN_PRE)
4435 {
4436 if (Np->sign == '-')
4437 strcpy(Np->inout_p, Np->L_negative_sign);
4438 else
(gdb) bt full
#0 NUM_numpart_to_char (id=3, Np=0xff924b5c)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/utils/adt/formatting.c:4434
end = <optimized out>
#1 NUM_processor (node=<optimized out>, Num=<optimized out>, inout=<optimized out>, number=<optimized out>,
from_char_input_len=0, to_char_out_pre_spaces=<optimized out>, sign=43, is_to_char=1 '\001',
collid=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/utils/adt/formatting.c:4788
n = <optimized out>
_Np = {is_to_char = 1 '\001', Num = 0xff924c20, sign = 43, sign_wrote = 0, num_count = 30, num_in = 0,
num_curr = 15, out_pre_spaces = 15, read_dec = 0, read_post = 0, read_pre = 0,
number = 0x6bdac0 "0.", '0' <repeats 15 times>, number_p = 0x6bdac0 "0.", '0' <repeats 15 times>,
inout = 0x6bd854 "", inout_p = 0x6bd854 "", last_relevant = 0x6bdac1 ".", '0' <repeats 15 times>,
L_negative_sign = 0x555908 "-", L_positive_sign = 0x4789d8 "+", decimal = 0x565d10 ".",
L_thousands_sep = 0x52ed18 ",", L_currency_symbol = 0x588920 " "}
#2 0x00354920 in numeric_to_char (fcinfo=0x6bbae4)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/utils/adt/formatting.c:5213
len = <optimized out>
value = <optimized out>
fmt = <optimized out>
Num = {pre = 16, post = 15, lsign = -1, flag = 98, pre_lsign_num = 0, multi = 0, zero_start = 0, zero_end = 0,
need_locale = 1}
format = 0x5dc09c
result = 0x6bd850
shouldFree = <optimized out>
out_pre_spaces = <optimized out>
sign = 43
numstr = 0x6bdac0 "0.", '0' <repeats 15 times>
orgnum = <optimized out>
p = <optimized out>
x = <optimized out>
#3 0x001e16b0 in ExecMakeFunctionResultNoSets (fcache=0x6bbaa8, econtext=0x6bb9c8, isNull=0x6bc229 "",
isDone=<optimized out>) at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execQual.c:2041
arg = <optimized out>
result = <optimized out>
fcinfo = 0x6bbae4
fcusage = {fs = 0x0, save_f_total_time = {tv_sec = 1, tv_usec = 0}, save_total = {tv_sec = 0, tv_usec = 0},
f_start = {tv_sec = 0, tv_usec = 0}}
i = <optimized out>
#4 0x001e5840 in ExecTargetList (isDone=0xff924d8c, itemIsDone=0x6bc2e0, isnull=0x6bc228 "", values=0x6bc218,
econtext=0x6bb9c8, tupdesc=<optimized out>, targetlist=0x6bc2b8)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execQual.c:5423
gstate = <optimized out>
tle = <optimized out>
resind = <optimized out>
att = 0x6bc02c
oldContext = 0x655430
tl = 0x6bc2d0
haveDoneSets = <optimized out>
#5 ExecProject (projInfo=<optimized out>, isDone=0xff924d8c)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execQual.c:5647
slot = 0x6bbf28
econtext = 0x6bb9c8
numSimpleVars = <optimized out>
#6 0x001e5c84 in ExecScan (node=0x6bb940, accessMtd=0x1faec0 <SeqNext>, recheckMtd=0x1faea0 <SeqRecheck>)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execScan.c:220
slot = 0x6bbf98
econtext = 0x6bb9c8
qual = 0x0
projInfo = 0x6bc238
isDone = ExprSingleResult
resultSlot = <optimized out>
#7 0x001de620 in ExecProcNode (node=0x6bb940)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execProcnode.c:419
result = <optimized out>
__func__ = "ExecProcNode"
#8 0x001da304 in ExecutePlan (dest=0x69c620, direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>,
operation=CMD_SELECT, use_parallel_mode=<optimized out>, planstate=0x6bb940, estate=0x6bb838)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execMain.c:1569
slot = <optimized out>
current_tuple_count = 0
#9 standard_ExecutorRun (queryDesc=0x6a61f8, direction=<optimized out>, count=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/executor/execMain.c:338
estate = 0x6bb838
operation = CMD_SELECT
dest = 0x69c620
sendTuples = <optimized out>
oldcontext = 0x6549b0
#10 0x0030606c in PortalRunSelect (portal=0x6233a0, forward=<optimized out>, count=0, dest=0x69c620)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/tcop/pquery.c:948
queryDesc = 0x6a61f8
direction = <optimized out>
nprocessed = <optimized out>
__func__ = "PortalRunSelect"
#11 0x0030747c in PortalRun (portal=0x6233a0, count=2147483647, isTopLevel=1 '\001', dest=0x69c620, altdest=0x69c620,
completionTag=0xff925160 "") at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/tcop/pquery.c:789
save_exception_stack = 0xff9250a4
save_context_stack = 0x0
local_sigjmp_buf = {{__jmpbuf = {-488613709, -488613485, 490890671}, __mask_was_saved = 0, __saved_mask = {
__val = {16797688, 6621224, 2, 0, 6165432, 931, 0, 0, 6165432, 6168576, 6094848, 6165504, 6053888, 120,
6320304, 1886343168, 6812424, 256, 0, 0, 0, 6812408, 4287778808, 3159308, 97, 1130710192, 2, 1, 0, 1,
1024, 0}}}}
result = <optimized out>
nprocessed = <optimized out>
saveTopTransactionResourceOwner = 0x650828
saveTopTransactionContext = 0x655298
saveActivePortal = 0x0
saveResourceOwner = 0x650828
savePortalContext = 0x0
saveMemoryContext = 0x655298
__func__ = "PortalRun"
#12 0x003033e4 in exec_simple_query (
query_string=0x67dec0 "SELECT '' AS to_char_6, to_char(val, 'FMS", '9' <repeats 16 times>, ".", '9' <repeats 15 times>, "') FROM num_data;") at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/tcop/postgres.c:1094
parsetree = 0x67ead0
portal = 0x6233a0
snapshot_set = <optimized out>
commandTag = <optimized out>
completionTag = "\000ELECT 10\000\000E", '\000' <repeats 35 times>, "\024\000\000\000\024\377\377\377\377\000]\000\364\000\\\377", <incomplete sequence \364>
plantree_list = 0x69c608
receiver = 0x69c620
format = 0
dest = DestRemote
parsetree_list = 0x67eb68
save_log_statement_stats = 0 '\000'
was_logged = 0 '\000'
msec_str = "\000\222Q\310\000\006\324\344\000bs\244\000bt$\000\000\000\034\000\000\000 \000bG0\000\000\000"
parsetree_item = 0x67eb58
isTopLevel = <optimized out>
#13 PostgresMain (argc=<optimized out>, argv=<optimized out>, dbname=0x606c70 "regression", username=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/tcop/postgres.c:4076
query_string = 0x67dec0 "SELECT '' AS to_char_6, to_char(val, 'FMS", '9' <repeats 16 times>, ".", '9' <repeats 15 times>, "') FROM num_data;"
firstchar = 6810320
input_message = {
data = 0x67dec0 "SELECT '' AS to_char_6, to_char(val, 'FMS", '9' <repeats 16 times>, ".", '9' <repeats 15 times>, "') FROM num_data;", len = 95, maxlen = 1024, cursor = 95}
local_sigjmp_buf = {{__jmpbuf = {-488613485, -488618077, 490879835}, __mask_was_saved = 1, __saved_mask = {
__val = {0, 0, 0, 0, 1882908820, 4287779072, 1879209388, 0, 0, 0, 1882908820, 1879054832, 1882055848, 4,
0, 0, 0, 0, 4287779152, 0, 0, 0, 0, 0, 0, 0, 0, 1498055587, 0, 0, 4294967295, 6052864}}}}
send_ready_for_query = 0 '\000'
disable_idle_in_transaction_timeout = 0 '\000'
__func__ = "PostgresMain"
#14 0x0006d6c8 in BackendRun (port=0x627398)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/postmaster/postmaster.c:4285
ac = 1
secs = 551370787
usecs = 396920
i = 1
av = 0x62da48
maxac = <optimized out>
#15 BackendStartup (port=0x627398)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/postmaster/postmaster.c:3959
bn = 0x10b3
pid = 0
#16 ServerLoop () at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/postmaster/postmaster.c:1715
port = 0x627398
rmask = {fds_bits = {16, 0 <repeats 31 times>}}
selres = <optimized out>
now = <optimized out>
readmask = {fds_bits = {16, 0 <repeats 31 times>}}
nSockets = 5
last_lockfile_recheck_time = 1498055585
last_touch_time = 1498055585
__func__ = "ServerLoop"
#17 0x0029ba3c in PostmasterMain (argc=8, argv=<optimized out>)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/postmaster/postmaster.c:1323
opt = <optimized out>
status = <optimized out>
userDoption = <optimized out>
listen_addr_saved = <optimized out>
i = <optimized out>
output_config_variable = <optimized out>
__func__ = "PostmasterMain"
#18 0x0006e6d8 in main (argc=8, argv=0x604af8)
at /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/backend/main/main.c:228
No locals.

A potentially similar issue was introduced in 9.4.7 and resolved in 9.4.8:

/messages/by-id/20160413094117.GC21485@msg.credativ.de

I can test patches or provide more information.

Best regards,
Tom Turelinckx

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#2Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Tom Turelinckx (#1)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Tom Turelinckx wrote:

It seems this issue was introduced in 9.4.9: 9.4.9, 9.4.10, 9.4.11 and
9.4.12 all crash at the same test at the same line in formatting.c,
but 9.4.8 builds successfully and passes all tests. It also seems that
both NUM_numpart_to_char and the failing test have not been touched
for, eh, decades, so the root cause must be somewhere else.

You're probably misreading the git log, because this code was touched
just before 9.4.9. See commit
https://git.postgresql.org/pg/commitdiff/20f870fd7cab8446c208a4a9cfa5ec2a441ef69c

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#3Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Turelinckx (#1)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

"Tom Turelinckx" <tom@turelinckx.be> writes:

I was trying to compile 9.4.12 on sparc from the debian source package
in the pgdg repo, but it's failing multiple regression tests.

Hm. What else fails besides the crash you're showing? It's not very
easy to deduce what's wrong there, but maybe some other symptom would
be more transparent.

It seems this issue was introduced in 9.4.9: 9.4.9, 9.4.10, 9.4.11 and 9.4.12 all crash at the same test at the same line in formatting.c, but 9.4.8 builds successfully and passes all tests. It also seems that both NUM_numpart_to_char and the failing test have not been touched for, eh, decades, so the root cause must be somewhere else.

Alvaro's right that there was a change in 9.4.9 in formatting.c, but
that seems unrelated. And if you're seeing issues elsewhere, a more
global root cause seems what to postulate.

A potentially similar issue was introduced in 9.4.7 and resolved in 9.4.8:
/messages/by-id/20160413094117.GC21485@msg.credativ.de

We never did get a clear explanation of why that crashed on Sparc.
I hypothesized over-aggressive compiler assumptions about alignment,
but there was no convincing evidence for that. It might be worth taking
a look at the assembly code immediately around the crash point, especially
if you could also get the corresponding code from 9.4.8.

regards, tom lane

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#4Tom Turelinckx
tom@turelinckx.be
In reply to: Alvaro Herrera (#2)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Alvaro Herrera wrote:

You're probably misreading the git log, because this code was touched
just before 9.4.9. See commit
https://git.postgresql.org/pg/commitdiff/20f870fd7cab8446c208a4a9cfa5ec2a441ef69c

Thanks for the pointer! When I reverse patch that commit against 9.4.12 it builds successfully and passes all tests.

Against the original 9.4.12, only the numeric and sanity check tests fail. Depending on the run, various other tests appear to fail, but those failures are caused by the numeric test crashing the backend and the tests being run in parallel:

WARNING: terminating connection because of crash of another server process
DETAIL: The postmaster has commanded this server process to roll back the current transaction and exit, because another server process exited abnormally and possibly corrupted shared memory.

I've verified that 10~beta1 also fails the numeric test:

Reading symbols from /home/turelto/src/original/postgresql-10-10~beta1/build/src/backend/postgres...done.
[New LWP 20199]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/sparc-linux-gnu/libthread_db.so.1".
Core was generated by `postgres: turelto regression [local] SELECT '.
Program terminated with signal 10, Bus error.
#0 NUM_numpart_to_char (id=3, Np=0xffd22aa4)
at /home/turelto/src/original/postgresql-10-10~beta1/build/../src/backend/utils/adt/formatting.c:4471
4471 if (Np->Num->lsign == NUM_LSIGN_PRE)
(gdb) l
4466 (Np->num_curr >= Np->out_pre_spaces || (IS_ZERO(Np->Num) && Np->Num->zero_start == Np->num_curr)) &&
4467 (IS_PREDEC_SPACE(Np) == FALSE || (Np->last_relevant && *Np->last_relevant == '.')))
4468 {
4469 if (IS_LSIGN(Np->Num))
4470 {
4471 if (Np->Num->lsign == NUM_LSIGN_PRE)
4472 {
4473 if (Np->sign == '-')
4474 strcpy(Np->inout_p, Np->L_negative_sign);
4475 else

When I reverse patch the above commit against 10~beta1 it also builds successfully and passes all tests.

When I reverse patch the above commit against 9.6.3 it passes the numeric test, but (still) fails the object_address test:

******** build/src/test/regress/regression.diffs ********
*** /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/../src/test/regress/expected/object_address.out    2017-05-08 23:15:12.000000000 +0200
--- /home/turelto/src/tmp/postgresql-9.6-9.6.3/build/src/test/regress/results/object_address.out        2017-06-22 11:11:23.000000000 +0200
***************
*** 263,273 ****
  WARNING:  error for policy,{eins,zwei,drei},{}: schema "eins" does not exist
  WARNING:  error for policy,{eins,zwei,drei},{integer}: schema "eins" does not exist
  WARNING:  error for user mapping,{eins},{}: argument list length must be exactly 1
! WARNING:  error for user mapping,{eins},{integer}: user mapping for user "eins" on server "integer" does not exist
  WARNING:  error for user mapping,{addr_nsp,zwei},{}: argument list length must be exactly 1
! WARNING:  error for user mapping,{addr_nsp,zwei},{integer}: user mapping for user "addr_nsp" on server "integer" does not exist
  WARNING:  error for user mapping,{eins,zwei,drei},{}: argument list length must be exactly 1
! WARNING:  error for user mapping,{eins,zwei,drei},{integer}: user mapping for user "eins" on server "integer" does not exist
  WARNING:  error for default acl,{eins},{}: argument list length must be exactly 1
  WARNING:  error for default acl,{eins},{integer}: unrecognized default ACL object type i
  WARNING:  error for default acl,{addr_nsp,zwei},{}: argument list length must be exactly 1
--- 263,273 ----
  WARNING:  error for policy,{eins,zwei,drei},{}: schema "eins" does not exist
  WARNING:  error for policy,{eins,zwei,drei},{integer}: schema "eins" does not exist
  WARNING:  error for user mapping,{eins},{}: argument list length must be exactly 1
! WARNING:  error for user mapping,{eins},{integer}: user mapping for user "(null)" on server "integer" does not exist
  WARNING:  error for user mapping,{addr_nsp,zwei},{}: argument list length must be exactly 1
! WARNING:  error for user mapping,{addr_nsp,zwei},{integer}: user mapping for user "(null)" on server "integer" does not exist
  WARNING:  error for user mapping,{eins,zwei,drei},{}: argument list length must be exactly 1
! WARNING:  error for user mapping,{eins,zwei,drei},{integer}: user mapping for user "(null)" on server "integer" does not exist
  WARNING:  error for default acl,{eins},{}: argument list length must be exactly 1
  WARNING:  error for default acl,{eins},{integer}: unrecognized default ACL object type i
  WARNING:  error for default acl,{addr_nsp,zwei},{}: argument list length must be exactly 1

======================================================================

The failing user mapping-related tests were introduced in commit 890192e (which is not in 9.4), and may have been resolved by commit 8b6d6cf (which is in 10~beta1).

Best regards,
Tom Turelinckx

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#5Tom Turelinckx
tom@turelinckx.be
In reply to: Tom Lane (#3)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Tom Lane wrote:

A potentially similar issue was introduced in 9.4.7 and resolved in 9.4.8:
/messages/by-id/20160413094117.GC21485@msg.credativ.de

We never did get a clear explanation of why that crashed on Sparc.
I hypothesized over-aggressive compiler assumptions about alignment, but there was no convincing evidence for that. It might be worth taking a look at the assembly code immediately around the crash point, especially if you could also get the corresponding code from 9.4.8.

I've already built 9.4.8, passing all tests. I can try to build 9.4.7 to reproduce that crash. I can also try to build 9.4.7 with just the relevant commit (0045691) applied to resolve that crash.

But I need more information / pointers to documentation on how to find and provide the information you need, as I have no experience with looking at assembly code.

Best regards,
Tom Turelinckx

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#6Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Turelinckx (#5)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

"Tom Turelinckx" <tom@turelinckx.be> writes:

But I need more information / pointers to documentation on how to find and provide the information you need, as I have no experience with looking at assembly code.

The way to get an assembly code file is to substitute -S for -c in the
compile command, and also remove any "-o file" option. So on my machine,
in an already built PG tree, I check what switches we're using:

$ cd src/backend/utils/adt
$ rm formatting.o
$ make formatting.o
gcc -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -g -O2 -I../../../../src/include -D_GNU_SOURCE -c -o formatting.o formatting.c

Now I copy-and-paste all the switches except -c and -o:

$ gcc -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -g -O2 -I../../../../src/include -D_GNU_SOURCE -S formatting.c

Note it's important that -g be one of the switches, else you don't get
line number annotations in the assembly.

Now I have a formatting.s file with contents like

.L275:
.LBE196:
.LBB197:
.LBB193:
.loc 1 2320 0
leal -1(%rcx), %edi
cmpl $1, %edi
jbe .L278
.loc 1 2322 0
cmpl $3, %ecx
.p2align 4,,2
je .L279
.LVL216:
.L246:
.loc 1 2325 0
movq (%rdx), %rax
testq %rax, %rax
je .L244
.loc 1 2331 0
leaq 1(%r14), %rdi
movq %r14, 48(%rsp)
movq %rbx, 64(%rsp)
.loc 1 2325 0
xorl %r10d, %r10d
.loc 1 2331 0
movq %rdx, %rbx

The important part of this for your purposes is the ".loc" annotations,
which indicate the source line number the following code was generated
from. Notice that it's not unusual for the compiler to rearrange code so
that instructions from different lines are interspersed --- here we
can see that lines 2325 and 2331 got mingled together. So there might
not be only one .loc annotation for the line where the crash is being
reported. Anyway, find those annotation(s) and send us all the text
for that area and maybe a few dozen lines on either side.

regards, tom lane

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#7Tom Turelinckx
tom@turelinckx.be
In reply to: Tom Lane (#6)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Tom Lane wrote:

Anyway, find those annotation(s) and send us all the text for that area and maybe a few dozen lines on either side.

Summary for the current issue, against 9.4.12.

Clean pgdg 9.4.12 fails:

regression.out:

numeric ... FAILED (test process exited with exit code 2)

regression.diffs:

SELECT '' AS to_char_6, to_char(val, 'FMS9999999999999999.999999999999999') FROM num_data;
! server closed the connection unexpectedly
! This probably means the server terminated abnormally
! before or while processing the request.
! connection to server was lost

postmaster.log:

LOG: server process (PID 15999) was terminated by signal 10: Bus error
DETAIL: Failed process was running: SELECT '' AS to_char_6, to_char(val, 'FMS9999999999999999.999999999999999') FROM num_data;

Crash location:

Reading symbols from /home/turelto/src/original/postgresql-9.4-9.4.12/build/src/backend/postgres...done.
[New LWP 15999]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/sparc-linux-gnu/libthread_db.so.1".
Core was generated by `postgres: turelto regression [local] SELECT '.
Program terminated with signal 10, Bus error.
#0 NUM_numpart_to_char (id=3, Np=0xff852a34)
at /home/turelto/src/original/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c:4419
4419 if (Np->Num->lsign == NUM_LSIGN_PRE)
(gdb) l
4414 (Np->num_curr >= Np->out_pre_spaces || (IS_ZERO(Np->Num) && Np->Num->zero_start == Np->num_curr)) &&
4415 (IS_PREDEC_SPACE(Np) == FALSE || (Np->last_relevant && *Np->last_relevant == '.')))
4416 {
4417 if (IS_LSIGN(Np->Num))
4418 {
4419 if (Np->Num->lsign == NUM_LSIGN_PRE)
4420 {
4421 if (Np->sign == '-')
4422 strcpy(Np->inout_p, Np->L_negative_sign);
4423 else
(gdb)

Assembly snippet attached as "original-9.4.12-snippet.s" from file generated with:

gcc -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -g -g -O2 -fstack-protector --param=ssp-buffer-size=4 -Wformat -Werror=format-security -I/usr/include/mit-krb5 -DLINUX_OOM_SCORE_ADJ=0 -I../../../../src/include -I/home/turelto/src/original/postgresql-9.4-9.4.12/build/../src/include -D_FORTIFY_SOURCE=2 -D_GNU_SOURCE -I/usr/include/libxml2 -I/usr/include/tcl8.5 -S /home/turelto/src/original/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c

Patched pgdg 9.4.12 passes all tests.

Applied patch attached as "patch" (reverts commit 20f870f):

Index: postgresql-9.4-9.4.12/src/backend/utils/adt/formatting.c
===================================================================
--- postgresql-9.4-9.4.12.orig/src/backend/utils/adt/formatting.c	2017-05-08 23:19:04.000000000 +0200
+++ postgresql-9.4-9.4.12/src/backend/utils/adt/formatting.c	2017-06-22 09:20:17.000000000 +0200
@@ -4173,12 +4173,12 @@
 		 (id == NUM_0 || id == NUM_9) ? "NUM_0/9" : id == NUM_DEC ? "NUM_DEC" : "???");
 #endif
+	if (*Np->inout_p == ' ')
+		Np->inout_p++;
+
 #define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
 #define AMOUNT_TEST(_s) (input_len-(Np->inout_p-Np->inout) >= _s)

-	if (OVERLOAD_TEST)
-		return;
-
 	if (*Np->inout_p == ' ')
 		Np->inout_p++;

@@ -4316,7 +4316,7 @@
 		 * next char is not digit
 		 */
 		if (IS_LSIGN(Np->Num) && isread &&
-			(Np->inout_p + 1) < Np->inout + input_len &&
+			(Np->inout_p + 1) <= Np->inout + input_len &&
 			!isdigit((unsigned char) *(Np->inout_p + 1)))
 		{
 			int			x;

Assembly snippet attached as "patched-9.4.12-snippet.s" from file generated with:

gcc -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -g -g -O2 -fstack-protector --param=ssp-buffer-size=4 -Wformat -Werror=format-security -I/usr/include/mit-krb5 -DLINUX_OOM_SCORE_ADJ=0 -I../../../../src/include -I/home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/include -D_FORTIFY_SOURCE=2 -D_GNU_SOURCE -I/usr/include/libxml2 -I/usr/include/tcl8.5 -S /home/turelto/src/tmp/postgresql-9.4-9.4.12/build/../src/backend/utils/adt/formatting.c

The line number where the crash is being reported before patching (4419) doesn't change after patching, and occurs in only one ".loc" annotation, before and after patching.

I will try to do the same for the issue from last year, replying to that thread.

Best regards,
Tom Turelinckx

Attachments:

original-9.4.12-snippet.s (application/octet-stream; name=original-9.4.12-snippet.s) [Download]
patch (application/octet-stream; name=patch) [Download] (+4 -4)
patched-9.4.12-snippet.s (application/octet-stream; name=patched-9.4.12-snippet.s) [Download]
#8Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Turelinckx (#7)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

"Tom Turelinckx" <tom@turelinckx.be> writes:

Tom Lane wrote:

Anyway, find those annotation(s) and send us all the text for that area and maybe a few dozen lines on either side.

Summary for the current issue, against 9.4.12.

Hm. The code seems about the same except that the compiler has changed
a few register assignments. In both cases, the crash has to be coming
from the load instruction in line 4419, unless gdb is totally lying to
us about where the crash is. And that register was loaded up at line
4389:

.loc 1 4389 0
ld [%fp-80], %g3
ld [%g3+12], %g2
...
.loc 1 4419 0
ld [%g3+8], %g2
cmp %g2, -1

(The "patched" code uses %g4 instead, but otherwise is the same.)
Now, the value loaded has to be a valid pointer, because the load
from [%g3+12] didn't crash. And nothing in the straight-line
code sequence changed %g3. But we aren't seeing all of the complicated
if-test at line 4413. It looks to me like the compiler has put some
of it out-of-line, at labels .LL697 and .LL726 (or .LL692 and .LL693
in the "patched" assembly). I am guessing that some part of those
code sequences must be trashing %g3 before jumping back to this
sequence at .LL553 or .LL715.

Maybe you could extract those bits too? Or if you prefer, just send me
the whole .s files off-list.

regards, tom lane

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#9Tom Lane
tgl@sss.pgh.pa.us
In reply to: Tom Turelinckx (#1)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

"Tom Turelinckx" <tom@turelinckx.be> writes:

Maybe you could extract those bits too? Or if you prefer, just send me the whole .s files off-list.

Whole .s files attached.

Thanks. The short of it seems to be that this is a compiler bug.
In the "original" code, the out-of-line segment at .LL726 is

.loc 1 4415 0
ldsb [%g4], %g4
.LL726:
cmp %g4, 48
bne,pt %icc, .LL715
andcc %g2, 64, %g0
ld [%g3+4], %g4
cmp %g4, 0
be,pt %icc, .LL715
andcc %g2, 64, %g0
ld [%fp-24], %g4
cmp %g4, 0
be,pn %icc, .LL727
add %l7, -2, %g3 <--- trashes %g3
ldsb [%g4], %g4
cmp %g4, 46
bne,a,pt %icc, .LL748
xor %l7, 6, %g2
.loc 1 4417 0
ba,pt %xcc, .LL715 <--- returns to .LL715 which needs %g3
andcc %g2, 64, %g0

The corresponding part in the "patched" code is

.LL693:
.LLBE547:
.LLBE578:
.LLBB579:
.LLBB556:
.loc 1 4415 0
cmp %o7, 48
bne,pt %icc, .LL712
andcc %g3, 64, %g0
ld [%g4+4], %o7
cmp %o7, 0
be,pt %icc, .LL712
andcc %g3, 64, %g0
ld [%fp-24], %o7
cmp %o7, 0
be,a,pn %icc, .LL722
add %g1, -2, %g4 <--- store to %g4 is annulled if no branch
ldsb [%o7], %o7
cmp %o7, 46
bne,a,pt %icc, .LL722
add %g1, -2, %g4 <--- store to %g4 is annulled if no branch
.loc 1 4417 0
ba,pt %xcc, .LL712 <--- returns to .LL712 which needs %g4
andcc %g3, 64, %g0

In short, the compiler is trying to hoist the first instruction for line
4456 into the delay slot of that branch to .LL727, and it's getting it
wrong. This would've been fine if the compiler had remembered to put the
"a" (annul) flag on the branch instruction, but it forgot.

I think you need to file a bug with the gcc maintainers. No idea if
there's anything you can give them that's shorter than this full
file, but maybe they won't care.

As a short-term workaround, reducing the -O level might help.
Or perhaps there's a different gcc version you could use?

regards, tom lane

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#10Bruce Momjian
bruce@momjian.us
In reply to: Tom Turelinckx (#1)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

What version of GCC are you testing with?

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#11Tom Turelinckx
tom@turelinckx.be
In reply to: Bruce Momjian (#10)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Tom Lane wrote:

Or perhaps there's a different gcc version you could use?

Greg Stark wrote:

What version of GCC are you testing with?

Wheezy on sparc has gcc-4.4 (4.4.7), gcc-4.6 (4.6.3) and gcc-4.7 (4.7.2), with gcc-4.6 being the default hence the one I was using.

Switching to gcc-4.7 resolved this issue, as well as the failing object_address test against 9.6.3. So, with gcc-4.7 postgresql-9.4.12, postgresql-9.6.3 and postgresql-10~beta1 all build successfully and pass all tests on sparc.

Switching to gcc-4.7 does not solve the alignment issue in 9.4.7 (resolved in 9.4.8) discussed here:

/messages/by-id/20160413094117.GC21485@msg.credativ.de

Tom Lane wrote:

We never did get a clear explanation of why that crashed on Sparc.
I hypothesized over-aggressive compiler assumptions about alignment,
but there was no convincing evidence for that. It might be worth taking
a look at the assembly code immediately around the crash point, especially
if you could also get the corresponding code from 9.4.8.

I can reproduce that crash against 9.4.7 with both gcc-4.6 and gcc-4.7, and will try to provide the relevant assembly snippets.

Best regards,
Tom Turelinckx

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#12Tom Turelinckx
tom@turelinckx.be
In reply to: Tom Lane (#3)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Tom Turelinckx wrote:

Switching to gcc-4.7 does not solve the alignment issue in 9.4.7 (resolved in 9.4.8) discussed here:

/messages/by-id/20160413094117.GC21485@msg.credativ.de

Tom Lane wrote:

We never did get a clear explanation of why that crashed on Sparc.

With gcc 4.7, against postgresql 9.4.7.

Clean 9.4.7 fails contrib/test-decoding:

LOG: server process (PID 18295) was terminated by signal 10: Bus error
DETAIL: Failed process was running: SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');

Reading symbols from /home/turelto/src/947/original/postgresql-9.4-9.4.7/build/src/backend/postgres...done.
[New LWP 18295]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/sparc-linux-gnu/libthread_db.so.1".
Core was generated by `postgres: turelto regression [local] SELECT '.
Program terminated with signal 10, Bus error.
#0 ReorderBufferRestoreChange (data=0x5e1acf "", txn=0x6143a0, rb=0x614318)
at /home/turelto/src/947/original/postgresql-9.4-9.4.7/build/../src/backend/replication/logical/reorderbuffer.c:2327
2327 Size tuplelen = ((HeapTuple) data)->t_len;
(gdb) l
2322 data += tuplelen;
2323 }
2324
2325 if (change->data.tp.newtuple)
2326 {
2327 Size tuplelen = ((HeapTuple) data)->t_len;
2328
2329 change->data.tp.newtuple =
2330 ReorderBufferGetTupleBuf(rb, tuplelen - offsetof(HeapTupleHeaderData, t_bits));
2331

After applying this commit from 9.4.8 against 9.4.7, patched 9.4.7 passes all tests:

https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=0045691

Assembly snippet around line 2327 before patching:

.LBB695:
.LBB694:
.loc 3 52 0
mov 20, %o2
add %o0, 4, %o0
.LVL301:
call memcpy, 0
add %l6, %l4, %l7
.LVL302:
.LBE694:
.LBE695:
.loc 1 2318 0
ld [%l5+28], %g2
.LBB696:
.LBB697:
.loc 3 52 0
mov %l6, %o1
mov %l4, %o2
.LBE697:
.LBE696:
.loc 1 2318 0
add %g2, 35, %g1
and %g1, -8, %g1
.loc 1 2317 0
st %g1, [%g2+20]
.LVL303:
.loc 1 2321 0
ld [%l5+28], %g1
.LBB699:
.LBB698:
.loc 3 52 0
call memcpy, 0
ld [%g1+20], %o0
.LVL304:
.LBE698:
.LBE699:
.LBE691:
.loc 1 2325 0
ld [%l5+32], %g1
.L355:
cmp %g1, 0
be,a,pn %icc, .L356
ld [%i1+84], %g2
.LBB700:
.loc 1 2327 0
ld [%l7], %l6
.LVL305:
.loc 1 2330 0
mov %i0, %o0
call ReorderBufferGetTupleBuf, 0
add %l6, -23, %o1
.LVL306:
.loc 1 2329 0
st %o0, [%l5+32]
.LVL307:
.LBB701:
.LBB702:
.loc 3 52 0
mov %l7, %o1
mov 20, %o2
.LVL308:
call memcpy, 0
add %o0, 4, %o0
.LVL309:
.LBE702:
.LBE701:
.loc 1 2339 0
ld [%l5+32], %g1
.LBB703:
.LBB704:
.loc 3 52 0
add %l7, 20, %o1
.LVL310:
mov %l6, %o2
.LBE704:
.LBE703:
.loc 1 2339 0
add %g1, 35, %g2
and %g2, -8, %g2
.loc 1 2338 0
st %g2, [%g1+20]
.LVL311:
.loc 1 2342 0
ld [%l5+32], %g1
.LBB706:
.LBB705:
.loc 3 52 0
call memcpy, 0
ld [%g1+20], %o0
.LVL312:
.LBE705:
.LBE706:
.LBE700:
.LBB707:
.LBB708:

Assembly snippet after patching:

.LBB699:
.LBB698:
.loc 3 52 0
mov 20, %o2
add %o0, 4, %o0
.LVL301:
call memcpy, 0
add %l6, %l4, %l7
.LVL302:
.LBE698:
.LBE699:
.loc 1 2322 0
ld [%l5+28], %g2
.LBB700:
.LBB701:
.loc 3 52 0
mov %l6, %o1
mov %l4, %o2
.LBE701:
.LBE700:
.loc 1 2322 0
add %g2, 35, %g1
and %g1, -8, %g1
.loc 1 2321 0
st %g1, [%g2+20]
.LVL303:
.loc 1 2325 0
ld [%l5+28], %g1
.LBB703:
.LBB702:
.loc 3 52 0
call memcpy, 0
ld [%g1+20], %o0
.LVL304:
.LBE702:
.LBE703:
.LBE695:
.loc 1 2329 0
ld [%l5+32], %g1
.L355:
cmp %g1, 0
be,a,pn %icc, .L356
ld [%i1+84], %g2
.LVL305:
.LBB704:
.LBB705:
.LBB706:
.loc 3 52 0
ldub [%l7], %g1
.LBE706:
.LBE705:
.loc 1 2338 0
mov %i0, %o0
.LBB708:
.LBB707:
.loc 3 52 0
stb %g1, [%fp-1036]
.LVL306:
ldub [%l7+1], %g1
stb %g1, [%fp-1035]
ldub [%l7+2], %g1
stb %g1, [%fp-1034]
ldub [%l7+3], %g1
stb %g1, [%fp-1033]
.LBE707:
.LBE708:
.loc 1 2338 0
ld [%fp-1036], %o1
call ReorderBufferGetTupleBuf, 0
add %o1, -23, %o1
.LVL307:
.loc 1 2337 0
st %o0, [%l5+32]
.LVL308:
.LBB709:
.LBB710:
.loc 3 52 0
mov %l7, %o1
mov 20, %o2
.LVL309:
call memcpy, 0
add %o0, 4, %o0
.LVL310:
.LBE710:
.LBE709:
.loc 1 2347 0
ld [%l5+32], %g1
.LBB711:
.LBB712:
.loc 3 52 0
add %l7, 20, %o1
.LVL311:
ld [%fp-1036], %o2
.LBE712:
.LBE711:
.loc 1 2347 0
add %g1, 35, %g2
and %g2, -8, %g2
.loc 1 2346 0
st %g2, [%g1+20]
.LVL312:
.loc 1 2350 0
ld [%l5+32], %g1
.LBB714:
.LBB713:
.loc 3 52 0
call memcpy, 0
ld [%g1+20], %o0
.LVL313:
.LBE713:
.LBE714:
.LBE704:
.LBB715:
.LBB716:

Best regards,
Tom Turelinckx

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#13Bruce Momjian
bruce@momjian.us
In reply to: Tom Turelinckx (#11)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

On 27 June 2017 at 13:05, Tom Turelinckx <tom@turelinckx.be> wrote:

What version of GCC are you testing with?

Wheezy on sparc has gcc-4.4 (4.4.7), gcc-4.6 (4.6.3) and gcc-4.7 (4.7.2), with gcc-4.6 being the default hence the one I was using.

Well that's curious.

I revived build farm member burbot which is an Ultra 5 -- also running
wheezy and it didn't report any issues. It's using gcc 4.6 as well:

https://buildfarm.postgresql.org/cgi-bin/show_history.pl?nm=burbot&br=REL9_6_STABLE

--
greg

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

#14Tom Turelinckx
tom@turelinckx.be
In reply to: Bruce Momjian (#13)
Re: Bus error in formatting.c NUM_numpart_to_char (9.4.12, 9.6.3, sparc)

Greg Stark wrote:

I revived build farm member burbot which is an Ultra 5 -- also running wheezy

Thanks for that!

and it didn't report any issues. It's using gcc 4.6 as well:

With default settings, the latest (2017-06-13) buildfarm client builds REL9_4_STABLE successfully for me as well (on wheezy on sparc), passing all tests, with gcc 4.6.

However, I was building from the Debian source packages in the pgdg repository (https://wiki.postgresql.org/wiki/Apt) and their default settings are different from those of the buildfarm client. I've been trying to reconfigure a build farm client such that it would build binaries identical to the Debian source package build (on wheezy on sparc), but I haven't quite succeeded yet.

I did, however, figure out the minimal change to the buildfarm client default settings that makes the numeric regression test fail:

*** build-farm.conf.sample      2017-06-13 17:32:45.000000000 +0200
--- build-farm.conf.fails.minimal       2017-06-29 11:43:40.000000000 +0200
***************
*** 188,194 ****
      config_opts =>[
          qw(
-           --enable-cassert
            --enable-debug
            --enable-nls
            --with-perl
--- 188,193 ----

With gcc 4.6, building without --enable-cassert will make the numeric regression test fail, while building with --enable-cassert will make the build pass all tests. With gcc 4.7, the build will pass all tests in both cases. Tested against REL9_4_STABLE only for the moment.

FWIW, this is where I've gotten with getting the buildfarm client to build as close as possible to the Debian source package (9.4.12 on wheezy on sparc), but it's not quite identical yet:

*** build-farm.conf.sample      2017-06-13 17:32:45.000000000 +0200
--- build-farm.conf.debian      2017-06-29 10:29:26.000000000 +0200
***************
*** 176,181 ****
--- 176,184 ----
          # comment out if not using ccache
          CC => 'ccache gcc',
+         PYTHON => '/usr/bin/python',
+         CFLAGS => '-g -O2 -fstack-protector --param=ssp-buffer-size=4 -Wformat -Werror=format-security -I/usr/include/mit-krb5  -DLINUX_OOM_SCORE_ADJ=0 -D_FORTIFY_SOURCE=2',
+         LDFLAGS => '-Wl,-z,relro -Wl,-z,now -Wl,--as-needed -L/usr/lib/mit-krb5 -L/usr/lib/sparc-linux-gnu/mit-krb5',
      },

# don't use --prefix or --with-pgport here
***************
*** 188,204 ****

      config_opts =>[
          qw(
!           --enable-cassert
!           --enable-debug
!           --enable-nls
            --with-perl
            --with-python
!           --with-tcl
!           --with-gssapi
            --with-openssl
-           --with-ldap
            --with-libxml
            --with-libxslt
            )
      ],

--- 191,217 ----
      config_opts =>[
          qw(
!           --with-tcl
            --with-perl
            --with-python
!           --with-pam
            --with-openssl
            --with-libxml
            --with-libxslt
+           --with-tclconfig=/usr/lib/tcl8.5
+           --with-includes=/usr/include/tcl8.5
+           --enable-nls
+           --enable-integer-datetimes
+           --enable-thread-safety
+           --enable-tap-tests
+           --enable-debug
+           --disable-rpath
+           --with-uuid=e2fs
+           --with-gnu-ld
+           --with-system-tzdata=/usr/share/zoneinfo
+           --with-krb5
+           --with-gssapi
+           --with-ldap
            )
      ],

Best regards,
Tom Turelinckx

--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs