[9.3 bug fix] backends emit hundreds of messages when statistics file is inaccessible
Hello,
I've found a bug in PostgreSQL 9.3 beta 2 which is related to statistics
collection which emits excesssive amount of error messages. Please find
attached the patch to fix this. I confirmed this problem occurs in
PostgreSQL 9.2, too. Could you backport this?
[Problem]
If the directory specified by stats_temp_directory becomes inaccessible to
postgres, enormous amount of WARNING messages are output. For example, when
I set wrong permission on the directory by mistake while the database server
is running and then started psql, more than 600 messages were emitted in a
few seconds like this:
WARNING: could not open statistics file
"/work/maumau/stats_tmp/pgstat.stat": Permission denied
LOG: could not open temporary statistics file
"/work/maumau/stats_tmp/pgstat.tmp": Permission denied
WARNING: could not open statistics file
"/work/maumau/stats_tmp/pgstat.stat": Permission denied
WARNING: could not open statistics file
"/work/maumau/stats_tmp/pgstat.stat": Permission denied
WARNING: could not open statistics file
"/work/maumau/stats_tmp/pgstat.stat": Permission denied
...
Even when there's no client, these messages are output regularly. I guess
this is because autovacuum runs.
[Cause]
In backend_read_statsfile(), the backend loops PGSTAT_POLL_LOOP_COUNT
(=1000) times trying to open the statistics file. The backend emits a
WARNING message if it fails to open the file in every iteration of the loop.
[How to fix]
Emit the message upon the first failed attempt. For some diagnostics
purposes, emit the message with DEBUG2 level on the subsequent attempts.
DEBUG2 is arbitrary.
Regards
MauMau
Attachments:
reduce_statsfile_error.patchapplication/octet-stream; name=reduce_statsfile_error.patchDownload
diff -rpcd orig/src/backend/postmaster/pgstat.c new/src/backend/postmaster/pgstat.c
*** orig/src/backend/postmaster/pgstat.c 2013-06-25 03:55:41.000000000 +0900
--- new/src/backend/postmaster/pgstat.c 2013-07-08 16:32:44.000000000 +0900
*************** done:
*** 4121,4127 ****
*/
static bool
pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
! TimestampTz *ts)
{
PgStat_StatDBEntry dbentry;
PgStat_GlobalStats myGlobalStats;
--- 4121,4127 ----
*/
static bool
pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
! TimestampTz *ts, int elevel)
{
PgStat_StatDBEntry dbentry;
PgStat_GlobalStats myGlobalStats;
*************** pgstat_read_db_statsfile_timestamp(Oid d
*** 4136,4142 ****
if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
{
if (errno != ENOENT)
! ereport(pgStatRunningInCollector ? LOG : WARNING,
(errcode_for_file_access(),
errmsg("could not open statistics file \"%s\": %m",
statfile)));
--- 4136,4142 ----
if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
{
if (errno != ENOENT)
! ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open statistics file \"%s\": %m",
statfile)));
*************** pgstat_read_db_statsfile_timestamp(Oid d
*** 4149,4155 ****
if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
format_id != PGSTAT_FILE_FORMAT_ID)
{
! ereport(pgStatRunningInCollector ? LOG : WARNING,
(errmsg("corrupted statistics file \"%s\"", statfile)));
FreeFile(fpin);
return false;
--- 4149,4155 ----
if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
format_id != PGSTAT_FILE_FORMAT_ID)
{
! ereport(elevel,
(errmsg("corrupted statistics file \"%s\"", statfile)));
FreeFile(fpin);
return false;
*************** pgstat_read_db_statsfile_timestamp(Oid d
*** 4161,4167 ****
if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
fpin) != sizeof(myGlobalStats))
{
! ereport(pgStatRunningInCollector ? LOG : WARNING,
(errmsg("corrupted statistics file \"%s\"", statfile)));
FreeFile(fpin);
return false;
--- 4161,4167 ----
if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
fpin) != sizeof(myGlobalStats))
{
! ereport(elevel,
(errmsg("corrupted statistics file \"%s\"", statfile)));
FreeFile(fpin);
return false;
*************** pgstat_read_db_statsfile_timestamp(Oid d
*** 4186,4192 ****
if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
fpin) != offsetof(PgStat_StatDBEntry, tables))
{
! ereport(pgStatRunningInCollector ? LOG : WARNING,
(errmsg("corrupted statistics file \"%s\"",
statfile)));
goto done;
--- 4186,4192 ----
if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
fpin) != offsetof(PgStat_StatDBEntry, tables))
{
! ereport(elevel,
(errmsg("corrupted statistics file \"%s\"",
statfile)));
goto done;
*************** pgstat_read_db_statsfile_timestamp(Oid d
*** 4208,4214 ****
goto done;
default:
! ereport(pgStatRunningInCollector ? LOG : WARNING,
(errmsg("corrupted statistics file \"%s\"",
statfile)));
goto done;
--- 4208,4214 ----
goto done;
default:
! ereport(elevel,
(errmsg("corrupted statistics file \"%s\"",
statfile)));
goto done;
*************** backend_read_statsfile(void)
*** 4231,4236 ****
--- 4231,4237 ----
TimestampTz min_ts = 0;
TimestampTz ref_ts = 0;
int count;
+ int elevel = WARNING;
/* already read it? */
if (pgStatDBHash)
*************** backend_read_statsfile(void)
*** 4250,4256 ****
CHECK_FOR_INTERRUPTS();
! ok = pgstat_read_db_statsfile_timestamp(MyDatabaseId, false, &file_ts);
cur_ts = GetCurrentTimestamp();
/* Calculate min acceptable timestamp, if we didn't already */
--- 4251,4257 ----
CHECK_FOR_INTERRUPTS();
! ok = pgstat_read_db_statsfile_timestamp(MyDatabaseId, false, &file_ts, elevel);
cur_ts = GetCurrentTimestamp();
/* Calculate min acceptable timestamp, if we didn't already */
*************** backend_read_statsfile(void)
*** 4317,4322 ****
--- 4318,4327 ----
if (ok && file_ts >= min_ts)
break;
+ /* Avoid swamping the server log file with failure messages */
+ if (!ok)
+ elevel = DEBUG2;
+
/* Not there or too old, so kick the collector and wait a bit */
if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
pgstat_send_inquiry(cur_ts, min_ts, MyDatabaseId);
"MauMau" <maumau307@gmail.com> writes:
I've found a bug in PostgreSQL 9.3 beta 2 which is related to statistics
collection which emits excesssive amount of error messages. Please find
attached the patch to fix this. I confirmed this problem occurs in
PostgreSQL 9.2, too. Could you backport this?
It's not apparent that this is a good idea, let alone a bug fix.
If the directory specified by stats_temp_directory becomes inaccessible to
postgres, enormous amount of WARNING messages are output.
Well, yeah, because all of that functionality just broke. Not warning
about it doesn't seem like a good idea. AFAICT, your patch would result
in the situation not being remarked on anywhere except in the postmaster
log.
regards, tom lane
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
From: "Tom Lane" <tgl@sss.pgh.pa.us>
"MauMau" <maumau307@gmail.com> writes:
If the directory specified by stats_temp_directory becomes inaccessible
to
postgres, enormous amount of WARNING messages are output.Well, yeah, because all of that functionality just broke. Not warning
about it doesn't seem like a good idea. AFAICT, your patch would result
in the situation not being remarked on anywhere except in the postmaster
log.
No, my patch makes backend_read_statsfile_timestamp() emit just one warning
message instead of outputing the same message 1,000 times in one call. So
your concern does not exist. Is this OK?
Regards
MauMau
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers