# Retrieve examples of names.
# curl -O https://raw.githubusercontent.com/philipperemy/name-dataset/master/names_dataset/v1/last_names.all.txt

# Process the names using Daitch-Mokotoff at https://stevemorse.org/census/soundexbatch.html
# and save the result (right column) in the file last_names.morse-soundex.txt

# Build the two inputs needed for the comparison:
#  - last_names.morse.txt: each name, a tab, then the reference codes for that
#    name sorted and separated by single spaces;
#  - last_names.linenum.txt: every name prefixed with its line number.
paste last_names.all.txt last_names.morse-soundex.txt \
  | perl -F'\t' -lane 'print join("\t", $F[0], join(" ", sort split(/ /, $F[1])))' \
  > last_names.morse.txt
cat -n < last_names.all.txt > last_names.linenum.txt

# Load the numbered names into a temporary table, encode each one with
# daitch_mokotoff(), and write "name<TAB>sorted codes" to
# last_names.postgres.txt in the order of the numbered input file.
#   -X                  skip ~/.psqlrc so personal settings cannot alter the run
#   -v ON_ERROR_STOP=1  abort at the first failing command instead of carrying
#                       on and leaving a stale or partial output file
# The here-document delimiter is quoted so the shell passes the SQL and the
# psql backslash commands through verbatim.
psql -X -v ON_ERROR_STOP=1 postgres postgres <<'EOF'
CREATE EXTENSION IF NOT EXISTS daitch_mokotoff;

CREATE TEMPORARY TABLE last_name (
  n integer,
  last_name text
);

\copy last_name (n, last_name) FROM last_names.linenum.txt

-- One row per name, with its codes sorted and joined by single spaces.
-- A psql meta-command has to fit on one line, so the query lives in a view
-- and the export command below stays short.
CREATE TEMPORARY VIEW last_name_code AS
SELECT
  ln.n,
  ln.last_name,
  -- string_agg() is NULL when there is nothing to aggregate, and COPY would
  -- print that as a NULL marker. Emit an empty field instead, which is what
  -- the reference file contains for a name without codes.
  COALESCE(
    (
      SELECT string_agg(c, ' ' ORDER BY c)
      FROM regexp_split_to_table(daitch_mokotoff(ln.last_name), ' ') AS c
    ),
    ''
  ) AS codes
FROM last_name AS ln;

\copy (SELECT last_name, codes FROM last_name_code ORDER BY n) TO last_names.postgres.txt
EOF

# Page through a unified diff of the reference output against the PostgreSQL
# output. An empty diff means the two files are identical.
diff -u last_names.morse.txt last_names.postgres.txt \
  | less
