From dfe5ad9229e812d0f805d6a39cb296b9284e30f2 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Wed, 18 Oct 2023 16:18:58 -0700
Subject: [PATCH v4 2/4] heavily wip: update typedefs

---
 meson.build                           | 14 ++++-
 src/backend/jit/llvm/meson.build      |  2 +-
 src/backend/snowball/meson.build      |  2 +-
 src/timezone/meson.build              |  2 +-
 src/tools/pgindent/merge_typedefs     | 86 +++++++++++++++++++++++++++
 src/tools/pgindent/meson.build        | 69 +++++++++++++++++++++
 src/tools/pgindent/typedefs_from_objs | 42 +++++++++++++
 7 files changed, 213 insertions(+), 4 deletions(-)
 create mode 100755 src/tools/pgindent/merge_typedefs
 create mode 100644 src/tools/pgindent/meson.build
 create mode 100755 src/tools/pgindent/typedefs_from_objs

diff --git a/meson.build b/meson.build
index 2cfeb60eb07..492ce9a48b1 100644
--- a/meson.build
+++ b/meson.build
@@ -2579,6 +2579,7 @@ add_project_link_arguments(ldflags, language: ['c', 'cpp'])
 
 # list of targets for various alias targets
 backend_targets = []
+data_targets = []
 bin_targets = []
 pl_targets = []
 contrib_targets = []
@@ -2894,6 +2895,8 @@ subdir('src/interfaces/ecpg/test')
 
 subdir('doc/src/sgml')
 
+subdir('src/tools/pgindent')
+
 generated_sources_ac += {'': ['GNUmakefile']}
 
 # After processing src/test, add test_install_libs to the testprep_targets
@@ -2982,6 +2985,7 @@ endif
 all_built = [
   backend_targets,
   bin_targets,
+  data_targets,
   libpq_st,
   pl_targets,
   contrib_targets,
@@ -3017,12 +3021,20 @@ run_target('install-test-files',
 # Indentation and similar targets
 ###############################################################
 
+# If the dependencies for generating a local typedefs.list are fulfilled, we
+# use a combination of a locally built and the source tree's typedefs.list
+# file (see src/tools/pgindent/meson.build) for reindenting. That ensures
+# newly added typedefs are indented correctly.
 indent_base_cmd = [perl, files('src/tools/pgindent/pgindent'),
     '--indent', pg_bsd_indent.full_path(),
     '--sourcetree=@SOURCE_ROOT@']
 indent_depend = [pg_bsd_indent]
 
-# reindent the entire tree
+if typedefs_supported
+  indent_base_cmd += ['--typedefs', typedefs_merged.full_path()]
+  indent_depend += typedefs
+endif
+
 # Reindent the entire tree
 run_target('indent-tree',
   command: indent_base_cmd + ['.'],
diff --git a/src/backend/jit/llvm/meson.build b/src/backend/jit/llvm/meson.build
index 8ffaf414609..7e2f6d5bd5c 100644
--- a/src/backend/jit/llvm/meson.build
+++ b/src/backend/jit/llvm/meson.build
@@ -82,4 +82,4 @@ llvmjit_types = custom_target('llvmjit_types.bc',
   install_dir: dir_lib_pkg,
   depfile: '@BASENAME@.c.bc.d',
 )
-backend_targets += llvmjit_types
+data_targets += llvmjit_types
diff --git a/src/backend/snowball/meson.build b/src/backend/snowball/meson.build
index 0f669c0bf3c..69e777399fd 100644
--- a/src/backend/snowball/meson.build
+++ b/src/backend/snowball/meson.build
@@ -94,4 +94,4 @@ install_subdir('stopwords',
 )
 
 backend_targets += dict_snowball
-backend_targets += snowball_create
+data_targets += snowball_create
diff --git a/src/timezone/meson.build b/src/timezone/meson.build
index 7b85a01c6bd..779a51d85a4 100644
--- a/src/timezone/meson.build
+++ b/src/timezone/meson.build
@@ -50,7 +50,7 @@ if get_option('system_tzdata') == ''
     install_dir: dir_data,
   )
 
-  bin_targets += tzdata
+  data_targets += tzdata
 endif
 
 subdir('tznames')
diff --git a/src/tools/pgindent/merge_typedefs b/src/tools/pgindent/merge_typedefs
new file mode 100755
index 00000000000..9826fd1a675
--- /dev/null
+++ b/src/tools/pgindent/merge_typedefs
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+# Tool to build a new typedefs.list file from
+# a) the current typedefs.list file in the source tree
+# b) newly generated typedefs.list fragments for build products
+# c) the source code
+#
+# To avoid littering the typedefs file with typedefs that are not in our source
+# code, the list of typedefs found is filtered with tokens found in source
+# files.
+#
+# If the local build references a typedef that's not present in the source
+# typedefs.list, we can discover such typedefs.
+#
+# If the source typedefs.list references a typedef that is not used anymore, it
+# will not appear in the source tree anymore and thus can be filtered out.
+#
+# However, typedefs in newly written code for a different platform (or a
+# disabled build option), will not be discovered.
+
+import argparse
+import os
+import subprocess
+import sys
+import re
+
+parser = argparse.ArgumentParser(
+    description='build new typedefs files from multiple inputs')
+
+parser.add_argument('--output-local', type=argparse.FileType('w'), required=True)
+parser.add_argument('--output-merged', type=argparse.FileType('w'), required=True)
+parser.add_argument('--current-typedefs', type=argparse.FileType('r'), required=True)
+parser.add_argument('--filter-source', type=str, required=False)
+parser.add_argument('input', type=argparse.FileType('r'), nargs='*')
+
+
+def merge_typedefs(files):
+    """Return the set of typedef names (non-blank, stripped lines) in files."""
+    typedefs = set()
+    for fh in files:
+        typedefs.update(name for name in map(str.strip, fh) if name)
+    return typedefs
+
+def source_list(srcdir):
+    """chdir into srcdir and return its C/C++/lex/yacc files as ./ paths."""
+    os.chdir(srcdir)
+    accepted_files = re.compile(r'\.(c|h|l|y|cpp)$')
+    filelist = []
+    for root, dirs, files in os.walk('.', topdown=True):
+        # don't go into hidden dirs or docs; slice-assign so os.walk sees the
+        # pruned list (removing while iterating skipped the following entry)
+        dirs[:] = [d for d in dirs if not (d.startswith('.') or d == 'doc')]
+        for f in files:
+            if accepted_files.search(f):
+                filelist.append(os.path.join(root, f))
+    return filelist
+
+def tokenize_sources(files):
+    """Return the set of word tokens in files, with /* */ comments removed."""
+    comment_sub_re = re.compile(r'/\*.*?\*/', flags=re.DOTALL)
+    token_find_re = re.compile(r'\b\w+\b')
+    tokens = set()
+    for file in files:
+        # decode explicitly and leniently; a comment separates tokens like ' '
+        with open(file, 'r', encoding='utf-8', errors='replace') as fh:
+            content = comment_sub_re.sub(' ', fh.read())
+        tokens.update(token_find_re.findall(content))
+    return tokens
+
+def main(args):
+    """Emit the local and merged typedef lists (source-filtered if requested)."""
+    local_typedefs = merge_typedefs(args.input)
+    current_typedefs = merge_typedefs([args.current_typedefs])
+
+    if args.filter_source:
+        tokens = tokenize_sources(source_list(args.filter_source))
+        local_typedefs &= tokens
+        current_typedefs &= tokens
+
+    merged_typedefs = current_typedefs | local_typedefs
+    args.output_local.write('\n'.join(sorted(local_typedefs)) + '\n')
+    args.output_merged.write('\n'.join(sorted(merged_typedefs)) + '\n')
+
+if __name__ == '__main__':
+    main(parser.parse_args())
+    sys.exit(0)
diff --git a/src/tools/pgindent/meson.build b/src/tools/pgindent/meson.build
new file mode 100644
index 00000000000..de02c676a46
--- /dev/null
+++ b/src/tools/pgindent/meson.build
@@ -0,0 +1,69 @@
+# Currently the code requires meson 0.63 or upwards. This could likely be
+# lifted (in fact, some older versions actually work, but only some), but for
+# now it's not crucial.
+typedefs_supported = meson.version().version_compare('>=0.63')
+if not typedefs_supported
+  subdir_done()
+endif
+
+typedefs_from_objs = files('typedefs_from_objs')
+typedefs_from_objs_cmd = [typedefs_from_objs, '--host', host_system, '--output', '@OUTPUT@', '@INPUT@']
+merge_typedefs = files('merge_typedefs')
+
+# XXX: This list of targets should likely not be maintained here
+typedef_src_tgts = [
+  backend_targets,
+  bin_targets,
+  [libpq_st],
+  pl_targets,
+  contrib_targets,
+  ecpg_targets,
+]
+
+# We generate partial typedefs files for each binary/library. That makes using
+# this during incremental development much faster.
+#
+# We process the object files instead of the executables because extracting
+# typedefs from executables doesn't work on macos. There doesn't seem to be a
+# reason to target executables directly on other platforms.
+typedef_tgts = []
+foreach tgts : typedef_src_tgts
+  foreach tgt : tgts
+    # can't use tgt.name(), as we have targets that differ just in suffix
+    name = fs.name(tgt.full_path())
+    tdname = 'typedefs.list.local-@0@'.format(name)
+    objs = tgt.extract_all_objects(recursive: true)
+    typedef_tgts += custom_target(tdname,
+                                  output: tdname,
+                                  command: typedefs_from_objs_cmd,
+                                  input: objs,
+                                  build_by_default: false,
+                                  )
+
+  endforeach
+endforeach
+
+
+# Build new typedefs.list files. typedefs.list.local will contain only
+# typedefs in the local build, whereas typedefs.list.merged is the combination
+# of typedefs.list.local and the source tree's typedefs.list. However,
+# typedefs that are not used in the code anymore are removed even from
+# typedefs.list.merged.
+typedefs = custom_target('typedefs.list', output: ['typedefs.list.local', 'typedefs.list.merged'],
+  command: [merge_typedefs,
+            '--output-local', '@OUTPUT0@',
+            '--output-merged', '@OUTPUT1@',
+            '--current-typedefs', files('typedefs.list'),
+            '--filter-source', '@SOURCE_ROOT@',
+            '@INPUT@'],
+  input: typedef_tgts,
+  build_by_default: false,
+)
+typedefs_local = typedefs[0]
+typedefs_merged = typedefs[1]
+
+if cp.found()
+  run_target('update-typedefs',
+    command: [cp, typedefs_merged, '@SOURCE_ROOT@/src/tools/pgindent/typedefs.list'],
+  )
+endif
diff --git a/src/tools/pgindent/typedefs_from_objs b/src/tools/pgindent/typedefs_from_objs
new file mode 100755
index 00000000000..821c84cb3d8
--- /dev/null
+++ b/src/tools/pgindent/typedefs_from_objs
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+# Tool to extract typedefs from object files.
+
+import argparse
+import re
+import shutil
+import subprocess
+import sys
+
+parser = argparse.ArgumentParser(
+    description='generate typedefs file for a set of object files')
+
+parser.add_argument('--output', type=argparse.FileType('w'), required=True)
+parser.add_argument('--host', type=str, required=True)
+parser.add_argument('input', nargs='*')
+
+args = parser.parse_args()
+
+if args.host == 'linux':
+    find_td_re = re.compile(r'^[^\n]+TAG_typedef\)\n[^\n]+DW_AT_name\s*:\s*\([^\)]+\): ([^\n]+)$', re.MULTILINE)
+    # FIXME: should probably be set by the caller? Except that the same binary
+    # name behaves very differently on different platforms :/
+    cmd = [shutil.which('objdump'), '-Wi']
+elif args.host == 'darwin':
+    find_td_re = re.compile(r'^[^\n]+TAG_typedef\n\s*DW_AT_type[^\n]+\n\s+DW_AT_name\s*\(\"([^\n]+)\"\)$', re.MULTILINE)
+    cmd = [shutil.which('dwarfdump')]
+else:
+    sys.exit(f'unsupported platform: {args.host}')
+
+if cmd[0] is None:  # shutil.which() found no such tool in PATH
+    sys.exit(f'could not find the DWARF dump tool for {args.host} in PATH')
+
+lcmd = cmd + args.input
+sp = subprocess.run(lcmd, stdout=subprocess.PIPE, universal_newlines=True)
+if sp.returncode != 0:
+    print(f'{lcmd} failed with return code {sp.returncode}', file=sys.stderr)
+    sys.exit(sp.returncode)
+
+for typedef in find_td_re.findall(sp.stdout):
+    print(typedef, file=args.output)
+sys.exit(0)
-- 
2.38.0

