# Record which machine this run is executing on.
echo "$HOSTNAME"

# Data directory of the throw-away cluster (passed to the tools as -D).
TMPDATA="/hdd/scratch/data2"
# Installation prefix of the PostgreSQL build under test; tools live in bin/.
INST="/home/jjanes/pgsql/torn_biect/"
# Extra server options handed to `pg_ctl restart -o` further down.
JJXID="--JJ_xid=400"
JJTORN="--JJ_torn_page=1500"
# Same torn-page option set to 0; used for the restart that must not crash.
JJTORNOFF="--JJ_torn_page=0"

# Exported so every PostgreSQL program started below inherits the port.
export PGPORT="9876"

# Normal-termination handler: announce the cleanup, stop the server with an
# immediate shutdown (waiting for it to finish), and leave with status 0.
# Globals: INST (read), TMPDATA (read)
on_exit() {
  echo "Cleaning up"
  "${INST}/bin/pg_ctl" -D "${TMPDATA}" stop -m immediate -w
  ## The data directory is deliberately left in place even on a normal exit:
  ## removing it by hand is easy, and it is hard to be sure that every
  ## failure mode goes through on_error instead of through here.
  # rm -r $TMPDATA
  exit 0
}

# Failure handler: announce the error, stop the server with an immediate
# shutdown (waiting for it to finish), and leave with status 1.
# Globals: INST (read), TMPDATA (read)
on_error() {
  echo "Exiting on error"
  "${INST}/bin/pg_ctl" -D "${TMPDATA}" stop -m immediate -w
  # The data directory is kept on purpose so it can be examined afterwards.
  # rm -r $TMPDATA/
  exit 1
}

# Run on_exit when the script receives either SIGUSR2 or SIGINT.
trap on_exit USR2 INT

# Start from scratch: stop whatever server is still using the data directory,
# delete the directory, then initialise a new cluster with data checksums
# (-k).  If initdb fails, leave with its exit status.
"${INST}/bin/pg_ctl" -D "${TMPDATA}" stop -m immediate -w
rm -r "${TMPDATA}"
"${INST}/bin/initdb" -k -D "${TMPDATA}" || exit $?

## Append the stress-test settings to postgresql.conf in the data directory.
## The here-document delimiter is quoted so the shell copies the body
## verbatim.  With an unquoted delimiter the backquoted `date` inside
## archive_command was executed while this file was being written, leaving a
## fixed timestamp in the configuration instead of a command that is
## evaluated each time archive_command runs.
cat <<'END'  >> "$TMPDATA/postgresql.conf"
##  Jeff's changes to config for use with recovery stress testing

## can't do these two when testing 9.3
max_replication_slots=3
wal_level = logical

wal_keep_segments=20  ## preserve some evidence

##  Crashes are driven by checkpoints, so we want to do them often
#checkpoint_segments = 1
max_wal_size = 48MB 
min_wal_size = 32MB

max_prepared_transactions = 40

checkpoint_timeout = 30s
checkpoint_warning = 0
#archive_mode = on
## There is a known race condition that sometimes causes auto restart to fail when archiving is on.
## that is annoying, so turn it off unless we specifically want to test the on condition.
archive_mode = off
archive_command = 'echo archive_command %p %f `date`'       # Don't actually archive, just make pgsql think we are
archive_timeout = 30
track_io_timing=on
autovacuum_naptime = 2s
## if updates are not HOT, the table/index can easily bloat faster than default throttled autovac can possibly
## cope. So crank it up.
autovacuum_vacuum_cost_delay = 0ms
vacuum_cost_page_dirty = 0
## I think the default values of these is kind of bogus anyway.
vacuum_cost_page_hit = 0
vacuum_cost_page_miss = 0
log_line_prefix = '%p %i %e %m:'
restart_after_crash = on
## Since we crash the PG software, not the OS, fsync does not matter as the surviving OS is obligated to provide a 
## consistent view of the written-but-not-fsynced data even after PG restarts.  Turning it off gives more 
## testing per unit of time.
fsync=off
shared_preload_libraries = 'pg_stat_statements' 

wal_compression=1
track_commit_timestamp=1
END

## Diagnostic settings, appended to postgresql.conf commented out because
## they are too noisy to leave enabled permanently.  When a problem shows up,
## enabling these is among the first things to try.

cat >> "${TMPDATA}/postgresql.conf" <<'END'
#log_error_verbosity = verbose
#log_checkpoints = on
#log_autovacuum_min_duration=0
#JJ_vac=1
END

# Bring the server up (leaving with pg_ctl's status if that fails) and create
# the default database.
"${INST}/bin/pg_ctl" -D "${TMPDATA}" start -w || exit $?
"${INST}/bin/createdb"

# Log the server version, every non-default setting and the build
# configuration, so the run's output documents what was tested.
"${INST}/bin/psql" -c 'select version()'
"${INST}/bin/psql" -c " SELECT name, current_setting(name), SOURCE FROM pg_settings WHERE SOURCE NOT IN ('default', 'override');"
"${INST}/bin/pg_config"

# Install the extensions, one psql invocation each, in this order.
for extension in \
  pageinspect \
  pgstattuple \
  pg_stat_statements \
  pg_buffercache \
  pg_freespacemap \
  pg_visibility \
  btree_gist \
  btree_gin
do
  "${INST}/bin/psql" -c "create extension ${extension}"
done

##  Do the initial load while JJ_torn_page is still off; otherwise the WAL
##  from the GIN or GIST index build crashes the server before the table is
##  even initialized.
if ! perl count.pl 8 0; then
  on_error
fi

#while (true) ; do  psql -c "\dit+ ";  sleep 5; done &

# keep an eye on the twophase directory.  Detect presence of files as fast as we can;
# once detected, throttle back a bit to spare screen scrolling
## issue this was used for was fixed in 481725c0ba731b77fb32c
#perl -e 'while (1) { my $x=`ls -C /tmp/data2/pg_twophase/`; print $x; select undef,undef,undef,0.1 if $x}' &
# destroy on purpose
#perl -e 'while (1) { my $x=`rm /tmp/data2/pg_twophase/* 2> /dev/null`; }' &
#perl -e 'while (1) { my $x=`ls /tmp/data2/pg_subtrans/|wc`; print "number of pg_subtrans $x"; select undef,undef,undef,5}' &


## Main stress loop.  Each of the 1000 rounds restarts the server with the
## torn-page setting on ($JJTORN), runs the workload 100 times, restarts with
## the setting off ($JJTORNOFF) and vacuums every database.  Any workload,
## restart or vacuum failure goes through on_error, which stops the server
## and exits 1.
for g in $(seq 1 1000); do
  "$INST/bin/pg_ctl" -D "$TMPDATA" restart -o "--ignore_checksum_failure=0 $JJTORN $JJXID" -w
  echo "JJ starting loop $g"
  for f in $(seq 1 100); do
    #$INST/bin/psql -c 'SELECT datname, datfrozenxid, age(datfrozenxid) FROM pg_database;'; 
    ## on_error is needed to preserve database for inspection.  Otherwise autovac will destroy evidence.
    perl count.pl 24 || on_error
  done
  echo "JJ ending loop $g"
  ## give autovac a chance to run to completion
  ## need to disable crashing, as sometimes the vacuum itself triggers the crash
  ## and sometimes the restart also crashes during shutdown, so try it twice if needed.
  ## The retry runs in the current shell, not in a ( ... ) subshell: inside a
  ## subshell the "exit 1" in on_error would only end the subshell and the
  ## script would carry on after a failed restart.
  if ! "$INST/bin/pg_ctl" -D "$TMPDATA" restart -o "--ignore_checksum_failure=0 $JJTORNOFF $JJXID" -w; then
    sleep 5
    "$INST/bin/pg_ctl" -D "$TMPDATA" restart -o "--ignore_checksum_failure=0 $JJTORNOFF $JJXID" -w || on_error
  fi
  ## trying to get autovac to work in the face of consistent crashing 
  ## is just too hard, so do manual vacs unless autovac is specifically 
  ## what you are testing.
  "$INST/bin/vacuumdb" -a || on_error
  ## or sleep a few times in the hope autovac can get it done, if you want to test that.
  #$INST/bin/psql -c 'select pg_sleep(120)' || (sleep 5; $INST/bin/psql -c 'select pg_sleep(120)') || (sleep 5; $INST/bin/psql -c 'select pg_sleep(120)')## give autovac a chance to do its thing
  echo "JJ ending sleep after loop $g"
done
## All rounds completed: stop the server and exit 0.
on_exit
