#!/bin/bash
: <<END_COMMENT

  Migration of PostgreSQL's documentation from Docbook 4.x
  to Docbook 5.2 format.

  J. Purtz, juergen@purtz.de, September, 2022

  Description: see README.md


END_COMMENT


# --------------  individual environment  -------------------


# Adapt the three variables to your situation!
#
# Each of them may also be supplied through the environment, e.g.
#   ToRoot=/tmp/pg_db5.2 ./<this script>
# in which case the default given here is ignored.
#
#   ToolDir   directory containing this tool (db4-upgrade.xsl, docbook.rng,
#             htmlmathml-f.ent, doRealModifications.sh)
#   FromRoot  root of the original PostgreSQL source tree (DocBook 4.x)
#   ToRoot    root of the converted copy; it is removed and re-created
#             by the init step

ToolDir=${ToolDir:-~/02_IT/82_pgWork/docbook4_to_docbook5.2}
FromRoot=${FromRoot:-~/02_IT/83_pgSrc/postgresql}
ToRoot=${ToRoot:-~/02_IT/83_pgSrc/postgresql_db5.2}

# derived variables: the documentation directories within both trees
FromSgmlDir=${FromRoot}/doc/src/sgml
ToSgmlDir=${ToRoot}/doc/src/sgml


# ------------------  start  ---------------------------------

# Switches for the individual steps of this script. Set a switch to
# 'true' to execute the step, to 'false' to skip it.
doInit=true                # re-create the target tree from scratch
doUpgrade=true             # run the db4 -> db5 conversion on the sgml files
doRealModifications=true   # apply the manual corrections
doValidation=true          # validate the result against the RELAX NG schema
doSgmlDiff=false           # report differences between old and new sgml files
doHtmlDiff=false           # report differences between old and new html files

# All following steps use paths relative to the tool directory (tmp.sgml,
# docbook.rng, ...). Stop immediately if it is not reachable; otherwise
# files would be created or removed in whatever directory we happen to be in.
cd "$ToolDir" || { echo "Cannot change to tool directory '$ToolDir'" >&2; exit 1; }


# ------------------  init  ---------------------------------
# remove previous conversion and re-generate complete file
# structure including git

# Init step: throw away the result of a previous run and start with a
# fresh copy of the original source tree (including its .git directory).
if [[ "$doInit" == true ]]; then
  echo -e "\nINIT ..."

  # remove old directory structure and create new one
  if [[ -d "$ToRoot" ]]; then
    echo -e "Removing complete old conversion"
    # ':?' aborts the script instead of running 'rm -rf' on an empty path
    rm -rf -- "${ToRoot:?ToRoot must not be empty}"
  fi
  echo -e "Creating directory structure and copying existing files\n"
  cp -r -- "$FromRoot" "$ToRoot"

  # supply entities and Relax NG schema (taken from the tool directory,
  # which is the current working directory)
  cp -- htmlmathml-f.ent "$ToSgmlDir"
  cp -- docbook.rng "$ToSgmlDir"

  # remove validation via xmllint (temporary during development):
  # comment out every Makefile recipe line that starts with <TAB>$(XMLLINT).
  # '\t' is GNU sed's notation for the tab character.
  sed -i -E -e 's/\t(\$\(XMLLINT\))/\t# \1/' "$ToSgmlDir/Makefile"

  # a modification on one of our Perl scripts: it shall generate the
  # attribute 'xml:id' instead of 'id'
  sed -i -E -e 's/ id=/ xml:id=/' "$ToSgmlDir/generate-keywords-table.pl"

fi


# -----  db4 --> db5.2  ---------------------------------
# This step contains some dummy changes to cheat xsltproc.
# They are reverted after xsltproc has done its job.
#
# Among others the previous init-step has copied the sgml files to the 'to' directory.
# Here they are overwritten by a modified form of the 'from' files (but only sgml files,
# not xsl, css, .git, ... files)

# Returns 0 (true) if the file given as $1 must NOT be run through the
# db4 -> db5 conversion, 1 (false) otherwise.
#
# Only the exact base name is compared. A substring comparison is wrong here:
# "version.sgml" is also a substring of ref/alter_conversion.sgml,
# ref/create_conversion.sgml and ref/drop_conversion.sgml, which would
# silently exclude these three files from the conversion.
isExcludedFromConversion() {
  case "${1##*/}" in
    filelist.sgml|allfiles.sgml) return 0 ;;
    features-supported.sgml|features-unsupported.sgml) return 0 ;;
    errcodes-table.sgml|keywords-table.sgml|version.sgml) return 0 ;;
  esac
  return 1
}

# Upgrade step: convert every sgml file of the 'from' tree and store the
# result under the same name in the 'to' tree. The temporary file tmp.sgml
# is created in the current working directory (the tool directory).
if [[ "$doUpgrade" == true ]]; then
  echo -e "\ndb4 -> db5 upgrade ...\n"

  for fromFile in "$FromSgmlDir"/*sgml "$FromSgmlDir"/ref/*sgml; do

    # an unmatched glob pattern is handed over literally; skip it
    [[ -e "$fromFile" ]] || continue

    echo "$fromFile"
    fileName=${fromFile##*/}

    # files of the 'ref' subdirectory keep their place in the 'to' tree
    if [[ "${fromFile%/*}" == "$FromSgmlDir/ref" ]]; then
      toFile=$ToSgmlDir/ref/$fileName
    else
      toFile=$ToSgmlDir/$fileName
    fi

    if [[ "$fileName" == "filelist.sgml" ]]; then
      # filelist.sgml is not converted; the copy created by the init step
      # is modified in place.
      # move parameter entity 'reference' from postgres.sgml to here
      sed -i -e '/archive-modules/a \<!ENTITY reference  SYSTEM "reference.sgml"\>' "$toFile"
      # Change name of PG's entity 'parallel' to 'parallel-query' because it is a
      # predefined entity. But keep the original filename!
      sed -i -e 's/parallel\s*SYSTEM /parallel-query SYSTEM /' "$toFile"
      sed -i -e "/parallel-query/i <!-- in a global context the entity-name \'parallel\' has a different meaning -->" "$toFile"
    fi

    # omit some special files from the conversion
    if isExcludedFromConversion "$fromFile"; then
      continue
    fi

    if [[ "$fileName" == "postgres.sgml" ]]; then
      # In postgres.sgml we will conserve some entity-definitions
      cat "$fromFile" >tmp.sgml
      # consider the changed entity name
      sed -i -e 's/&parallel;/\&parallel-query;/' tmp.sgml
    else
      # Some sgml files are not well-formed, they have multiple root elements.
      # We surround all files with a single <dummy> tag and remove it after
      # the conversion.
      echo "<dummy>" >tmp.sgml
      cat "$fromFile" >>tmp.sgml
      echo "</dummy>" >>tmp.sgml
    fi

    # There is no way to hinder the xslt processor to replace entities
    # by their values. Hide them and restore the original syntax
    # later, eg: &version;  -->  _amp_version;  --> &version;
    sed -i -e '1,$s/&/_amp_/g' tmp.sgml
    # Two external entities in postgres.sgml will be thrown away by the
    # conversion anyway. We will recover them later.
    sed -i -e '1,$s/%version;//' tmp.sgml
    sed -i -e '1,$s/%filelist;//' tmp.sgml

    # If <pubdate> contains a valid day, the xslt processor outputs it in
    # a different syntax. Avoid this by changing the tag-name.
    sed -i -e '1,$s/pubdate>/pubdatex>/g' tmp.sgml

    # -----  perform the conversion -----
    # the db4->db5 migration step which is developed by the DocBook team; ignore some frequent warnings
    # (stderr is filtered by grep, stdout is the converted file)
    xsltproc --novalid --nodtdattr --encoding "utf-8" "$ToolDir/db4-upgrade.xsl" tmp.sgml \
           2> >(grep -Pv "(Found \w* inside |Converting ulink to (link|uri)\. )") >"$toFile"
    # -----------------------------------

    # restore <pubdate>
    sed -i -e '1,$s/pubdatex>/pubdate>/g' "$toFile"

    # restore entities
    sed -i -e '1,$s/_amp_/\&/g' "$toFile"

    if [[ "$fileName" == "postgres.sgml" ]]; then

      # postgres.sgml needs further modifications
      # a) remove first line (the conversion-comment) and
      #    second line (<book...>
      sed -i -e '1,2d' "$toFile"
      # b) insert entity definitions to the head of postgres.sgml
      #    (the delimiter is quoted: the text is taken literally)
      cat <<'EOT' >tmp.sgml
<!-- doc/src/sgml/postgres.sgml -->

<!--
The (outdated) use of DOCTYPE serves merely for the definition
of entities, character entities as well as parameter entities.
The validation process doesn't use this DTD-syntax, it uses
DocBook's RELAX NG schema where entities are unknown.

The use of character entities (eg: &mdash;) instead of hex-values
supports the readability of the source for authors.

The replacement of parameter entities (eg: %filelist;) with
xi:include as the more XML-conform syntax isn't possible without
major changes in most files:
  - Every xml/sgml-file needs a single root element.
  - In every xml/sgml-file we must re-declare namespace(s).
The reason is that parameter entities perform a plain text
substitution whereas xi:include creates trees and combines them.
-->

<!DOCTYPE book [

<!--  a summary of publicly predefined character entities: html,
      latin1, MathML, ...  -->
<!ENTITY % htmlmathml-f PUBLIC
         "-//W3C//ENTITIES HTML MathML Set//EN//XML"
         "https://www.w3.org/2003/entities/2007/htmlmathml-f.ent">
%htmlmathml-f;

<!-- ???  if working with xmlint or xsltproc, we need a local installation  ???
<!ENTITY % htmlmathml-f-local SYSTEM "htmlmathml-f.ent">
%htmlmathml-f-local;
-->

<!-- PG specific entities -->
<!ENTITY % version SYSTEM "version.sgml">
%version;
<!ENTITY % filelist SYSTEM "filelist.sgml">
%filelist;

<!--
Zero-width space.  Use this to allow line breaks at desirable places in
table cells, examples, etc. without causing an unwanted space when the
break is not needed in a wider output rendering.  The name is an abbreviation
for the publicly known entity 'ZeroWidthSpace'.
-->
<!ENTITY zwsp "&#x200B;">

]>

<book xmlns="http://docbook.org/ns/docbook"
      xmlns:xlink="http://www.w3.org/1999/xlink"
      xmlns:xi="http://www.w3.org/2001/XInclude"
      xmlns:m="http://www.w3.org/1998/Math/MathML"
      version="5.2" xml:id="postgres">

EOT

      cat "$toFile" >>tmp.sgml
      cp tmp.sgml "$toFile"

    else
      # remove <dummy> tag: first (a comment), second, and last line
      sed -i -e '1,2d;$d' "$toFile"

    # remove unnecessary namespace declaration (xsltproc insists to read it in every file)
    #sed -i -e '1,$s/ xmlns:xlink=\"http:\/\/www.w3.org\/1999\/xlink\"//' $toFile

    fi

  done

  # remove the latest version of tmp.sgml
  rm -f -- tmp.sgml

  # keywords-table.sgml is excluded from the conversion (see above), hence
  # its attribute 'id' must be renamed separately.
  # The former identical workaround for ref/{alter,create,drop}_conversion.sgml
  # is obsolete: these files were skipped only because their names contain the
  # substring "version.sgml"; they are converted regularly now.
  sed -i -e 's/ id=/ xml:id=/' "$ToSgmlDir/keywords-table.sgml"

  # There is an xml-file with a different formatting and only few changes. We
  # do the conversion manually.
  toFile=$ToSgmlDir/standalone-install.xml
  cp -- "$FromSgmlDir/standalone-install.xml" "$toFile"
  sed -i -e '2d;1,$s/ id=/ xml:id=/' "$toFile"

fi


# -----  real modifications  ---------------------------
# Manual corrections which the automatic conversion cannot perform.
# Note: '[[ ... ]]' compares the switch as a string. The former notation
# '($var == true)' executed the value of the switch as a command.
if [[ "$doRealModifications" == true ]]; then

  echo -e "\nRealModifications ...\n"

  # the sed commands below address the files by their plain names
  cd "$ToSgmlDir" || { echo "Cannot change to '$ToSgmlDir'" >&2; exit 1; }

  # part 1:  replace '&' with '&amp;' at certain places
  # (each expression changes the first occurrence per line only)

  sed -i -e 's/a&b/a\&amp;b/'       func.sgml
  sed -i -e 's/&input_mask/\&amp;input_mask/;s/&binaryIntVal/\&amp;binaryIntVal/' libpq.sgml
  sed -i -e 's/&date1/\&amp;date1/; s/&ts1,/\&amp;ts1,/; s/&iv1,/\&amp;iv1,/; s/&tsout)/\&amp;tsout)/' ecpg.sgml
  sed -i -e 's/ &s)/ \&amp;s)/; s/ &i)/ \&amp;i)/; s/,&(index)/,\&amp;(index)/; s/,&(result)/,\&amp;(result)/' ecpg.sgml
  sed -i -e 's/ &isnull)/ \&amp;isnull)/; s/ &tupdesc)/ \&amp;tupdesc)/; ' xfunc.sgml
  sed -i -e 's/ &x, &y)/ \&amp;x, \&amp;y)/; s/&buf)/\&amp;buf)/; s/&buf,/\&amp;buf,/' xtypes.sgml
  sed -i -e 's/ && / \&amp;\&amp; /; s/ &isnull)/ \&amp;isnull)/;' trigger.sgml

  # part 2: real modifications (separate script); it is sourced, i.e. it runs
  # within this shell and receives the target directory as its argument

  . "$ToolDir/doRealModifications.sh" "$ToSgmlDir"

fi


# -----  validate the result  ---------------------------
# Validate the converted documentation against DocBook's RELAX NG schema.
# Note: '[[ ... ]]' compares the switch as a string. The former notation
# '($var == true)' executed the value of the switch as a command.
if [[ "$doValidation" == true ]]; then

  echo -e "\nValidation ...\n"

  # postgres.sgml is addressed by its plain name
  cd "$ToSgmlDir" || { echo "Cannot change to '$ToSgmlDir'" >&2; exit 1; }

  # assemble huge file (works only with local ent file)
  #xmllint --noent postgres.sgml >postgres_all.sgml
  #xmllint --noout --relaxng $ToolDir/docbook.rng postgres.sgml 2>&1 | more
  #xmllint --noout --relaxng $ToolDir/docbook.rng postgres_all.sgml 2>&1 | more

  jing "$ToolDir/docbook.rng" postgres.sgml

fi

# -----  report SGML differences  ---------------------------
# Compare every original sgml file with its converted counterpart and print
# the differences, without those lines which show expected changes.
# Note: '[[ ... ]]' compares the switch as a string. The former notation
# '($var == true)' executed the value of the switch as a command.
if [[ "$doSgmlDiff" == true ]]; then

  # 'diff' plus 'egrep' to ignore or search for patterns

  echo -e "\nDifferences in SGML ...\n"

  cd "$FromSgmlDir" || { echo "Cannot change to '$FromSgmlDir'" >&2; exit 1; }

  # check sgml (the files of the 'ref' subdirectory are currently deactivated)
  for fromFile in "$FromSgmlDir"/*.sgml; do # "$FromSgmlDir"/ref/*.sgml; do
    # an unmatched glob pattern is handed over literally; skip it
    [[ -e "$fromFile" ]] || continue

    fileName=${fromFile##*/}
    if [[ "${fromFile%/*}" == "$FromSgmlDir/ref" ]]; then
      tmpFile=$ToSgmlDir/ref/$fileName
    else
      tmpFile=$ToSgmlDir/$fileName
    fi

    echo -e '\n**********  '"$fileName"'  **********'
    # diff -y --suppress-common-lines --width=140 $fromFile $tmpFile  |\
    # suppress lines containing 'id=' (which includes 'xml:id=') as well as
    # lines containing 'structname' or 'structfield'
    diff "$fromFile" "$tmpFile"  |\
         grep -E -v '(xml:id=|id=)'  |\
         grep -E -v '(structname|structfield)'
  done
fi


# -----  report HTML differences  ---------------------------
# Compare the html files generated from the original sources with those
# generated from the converted sources and print the differences, without
# those lines which show expected changes.
# Note: '[[ ... ]]' compares the switch as a string. The former notation
# '($var == true)' executed the value of the switch as a command.
if [[ "$doHtmlDiff" == true ]]; then

  # 'diff' plus 'egrep' to ignore or search for patterns
  echo -e "\nDifferences in HTML ...\n"

  # create html files (deactivated; change 'false' to 'true' to rebuild
  # the html files of both trees before comparing them)
  if false; then
    cd "$ToSgmlDir" || exit 1
    make STYLE=website
    cd "$FromSgmlDir" || exit 1
    make STYLE=website
  fi

  # check html files
  for fromFile in "$FromSgmlDir"/html/*.html; do
    # an unmatched glob pattern is handed over literally; skip it
    [[ -e "$fromFile" ]] || continue

    fileName=${fromFile##*/}
    tmpFile=$ToSgmlDir/html/$fileName

    echo -e '\n**********  '"$fileName"'  **********'
    #diff -w -y --suppress-common-lines --width=300 $fromFile $tmpFile |\
    diff "$fromFile" "$tmpFile" |\
         grep -E -v -w '(link|ulink)'   | \
         grep -E -v ' (class=\"structfield\"|class=\"structname\"|class=\"varname\")'  |\
         grep -E -v 'No newline at end of file'
  done

fi


