From 1c21433837c5bc34100f04bbfeb8ce403844e91d Mon Sep 17 00:00:00 2001
From: b
Date: Sun, 18 Jun 2023 08:17:03 +0000
Subject: [PATCH] Character encoding awareness

---
Reviewer notes (ignored by git am; not part of the commit message):
 * Option parsing: the three new --e/--ef/--ecf branches used `=` where
   `=~` was intended and their alternations were not grouped, so the
   encoding options were never recognised. Fixed; the value is now
   taken from $1.
 * urlencode: renamed the inner loop variable $b, which shadowed the
   sort variable of the same name.
 * Fixed typos in the added documentation lines.
 * Indentation, blank context lines and the whitespace-only -/+ pairs
   (PERL, make_perl) were reconstructed; use
   `git apply --ignore-whitespace` if a hunk does not match.

 configure.1.pl | 89 +++++++++++++++++++++++++++++++++++++++++---------
 makefile       |  6 ++--
 makefile.1.mak |  4 +--
 settings.txt   |  2 +-
 4 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/configure.1.pl b/configure.1.pl
index 34972c1..c452067 100755
--- a/configure.1.pl
+++ b/configure.1.pl
@@ -55,12 +55,12 @@
 # the settings from a file and then insert '#define' (C), 'use constant'
 # (Perl), or other statements into the files based on pattern matching.
 #
-# This is of for configuration done at compile time. Not at run time.
+# This is for configuration done at compile time. Not at run time.
 # There the program itself has to read the settings and make decisions.
 #
 # There also exist things like autoconf, automake, etc. see:
 # http://www.mrob.com/pub/comp/unix-building-history.html
-# However for many project I'm currently dealing with this script
+# However for many projects I'm currently dealing with, this script
 # here is totally enough and so far I did not have to learn these tools.
 # They are still a black box (or even black magic) to me.
 #
@@ -87,9 +87,20 @@
 # * --i, --in, --input - all further file paths are for input files
 # * --o, --out, --output - all further file paths are for output files
 # * -- - all further file paths are for configuration files
+# * --e=, --encoding= - sets the character encoding for input, output,
+#   and files
+# * --ef=, --encoding-file= - sets character encoding for files only,
+#   overrides --e
+# * --ecf=, --encoding-configfile= - sets character encoding for
+#   configuration files only, overrides --ef
 # * anything else will be treated as a path to a file: input, output,
 # or configuration. By default configuration files are expected.
 #
+# Unless defined otherwise, the character encoding for all files is UTF-8,
+# and for standard input/output it's the encoding determined from locale.
+# For parsing command line arguments and for opening files the system
+# locale is always used; this cannot be overridden.
+#
 # The script reads configuration from all configuration files given in the
 # command line parameters. The effect is similar to reading a single
 # concatenated file.
@@ -358,6 +369,9 @@
 # And afterwards the new tatget will be the default one.
 
 use strict;
+use utf8;
+use Encode::Locale ('decode_argv');
+use Encode ('encode', 'decode');
 
 use constant MAX_DEPTH => 256;
 use constant KEYWORD_PATTERN => '[A-Za-z0-9_\-\.]+';
@@ -391,6 +405,14 @@ $cfg{REPLACE_LINE()} = DEFAULT_REPLACE_LINE;
 $cfg{REPLACE_KEYWORD()} = DEFAULT_REPLACE_KEYWORD;
 $cfg{PATH_SEPARATOR()} = DEFAULT_PATH_SEPARATOR;
 
+my $encoding = '';
+my $encoding_file = '';
+my $encoding_configfile = '';
+my $encoding_stdin;
+my $encoding_stdout;
+
+decode_argv();
+
 foreach my $arg (@ARGV) {
     if ($arg =~ /^--(.*)$/) { # option
         $arg = $1;
@@ -414,6 +436,15 @@ foreach my $arg (@ARGV) {
         elsif ($arg =~ /^o(ut(put)?)?$/) {
             $file_type = 'o';
         }
+        elsif ($arg =~ /^e(?:ncoding)?=(.*)$/) {
+            $encoding = $1;
+        }
+        elsif ($arg =~ /^(?:ef|encoding-file)=(.*)$/) {
+            $encoding_file = $1;
+        }
+        elsif ($arg =~ /^(?:ecf|encoding-configfile)=(.*)$/) {
+            $encoding_configfile = $1;
+        }
         elsif ($arg eq '') {
             $file_type = '';
         }
@@ -434,8 +465,28 @@ foreach my $arg (@ARGV) {
     }
 }
 
+if ($encoding eq '') {
+    $encoding = 'UTF-8';
+    $encoding_stdin = 'console_in';
+    $encoding_stdout = 'console_out';
+}
+else {
+    $encoding_stdin = $encoding;
+    $encoding_stdout = $encoding;
+}
+if ($encoding_file eq '') {
+    $encoding_file = $encoding;
+}
+if ($encoding_configfile eq '') {
+    $encoding_configfile = $encoding_file;
+}
+
+binmode STDIN, ":encoding($encoding_stdin)";
+binmode STDOUT, ":encoding($encoding_stdout)";
+binmode STDERR, ":encoding($encoding_stdout)";
+
 foreach my $file (@config_files) {
-    %cfg = parse_file($file, 0, %cfg);
+    %cfg = parse_file($file, $encoding_configfile, 0, %cfg);
 }
 unless ($debug_all) {
     $debug_enabled = 0;
@@ -469,7 +520,7 @@ if (@input_files == 0) {
 while (@input_files > 0) {
     my $in = shift @input_files;
     my $out = shift @output_files;
-    convert_file($in, $out);
+    convert_file($in, $out, $encoding_file);
 }
 
 sub format_cfg {
@@ -541,6 +592,8 @@ sub escape {
     (my $text, my $match) = @_;
     unless (defined $match) {
         $match = '[\\\\\\\'\\\"]';
+        # [ \ \ \ ' \ "]
+        # [ \ ' "]
     }
 
     my $outcome = '';
@@ -562,17 +615,21 @@ sub escape {
     return $outcome;
 }
 
-# NOT UNICODE AWARE
+# UTF-8 only
+# TODO: this is almost duplicated from the common library
 sub urlencode {
     (my $text, my $match) = @_;
     unless (defined $match) {
         $match = '[^0-9A-Za-z.~\-_]'
     }
-    
+
     my $outcome = '';
     foreach my $ch (split('', $text)) {
         if ($ch =~ $match) {
-            $outcome .= sprintf('%%%02hX',ord($ch));
+            my $enc = encode('UTF-8', $ch);
+            foreach my $byte (split('', $enc)) {
+                $outcome .= sprintf('%%%02hX',ord($byte));
+            }
         }
         else {
             $outcome .= $ch;
@@ -581,8 +638,7 @@ sub urlencode {
     return $outcome;
 }
 
-# NOT UNICODE AWARE
-# (but there no real need to escaping non-ascii characters)
+# TODO: this is almost duplicated from the common library
 sub entityencode {
     (my $text, my $match) = @_;
     unless (defined $match) {
@@ -601,6 +657,7 @@ sub entityencode {
     return $outcome;
 }
 
+# TODO: this is duplicated from the common library
 sub join_path {
     (my $joiner, my @segments) = @_;
 
@@ -905,7 +962,7 @@ sub parse_value {
 }
 
 sub parse_file {
-    (my $path, my $depth, my %cfg) = @_;
+    (my $path, my $encoding, my $depth, my %cfg) = @_;
 
     if ($depth >= MAX_DEPTH) {
         print STDERR "Too deep.\n";
@@ -915,7 +972,7 @@ sub parse_file {
     print_debug($depth, "PARSE FILE $path");
 
     my $file;
-    unless (open $file, "<", $path) {
+    unless (open $file, "<:encoding($encoding)", encode('locale_fs', $path)) {
         print STDERR "Cannot open configfile $path.\n";
         exit 2;
     }
@@ -955,7 +1012,7 @@ sub parse_file {
         {
             my $path = parse_value($1, $depth+1, %cfg);
             print_debug($depth, "INCLUDE $path");
-            %cfg = parse_file($path, $depth+1, %cfg);
+            %cfg = parse_file($path, $encoding, $depth+1, %cfg);
             $name = '';
             $value = '';
         }
@@ -979,7 +1036,7 @@ sub parse_file {
 }
 
 sub convert_file {
-    (my $in, my $out) = @_;
+    (my $in, my $out, my $encoding) = @_;
 
     my $ref_in = ref($in);
     my $ref_out = ref($out);
@@ -996,7 +1053,7 @@ sub convert_file {
     unless ($ref_in) {
         my $path = $in;
         $in = undef;
-        unless (open $in, '<', $path) {
+        unless (open $in, "<:encoding($encoding)", encode('locale_fs', $path)) {
             print STDERR "Cannot open input file $path.\n";
             exit 2;
         }
@@ -1004,8 +1061,8 @@ sub convert_file {
     unless ($ref_out) {
         my $path = $out;
         $out = undef;
-        unless (open $out, '>', $path) {
-            print STDERR "Cannot open input file $path.\n";
+        unless (open $out, ">:encoding($encoding)", encode('locale_fs', $path)) {
+            print STDERR "Cannot open output file $path.\n";
             unless ($ref_in) {
                 close($in)
             }
diff --git a/makefile b/makefile
index 1ab71ea..7c71f4a 100644
--- a/makefile
+++ b/makefile
@@ -11,7 +11,7 @@ endif
 CONFIGFILE = settings-$(TARGET).txt settings.txt
 DEFAULT_CONFIGFILE = settings-$(DEFAULT_TARGET).txt settings.txt
 
-PERL	= /usr/bin/perl
+PERL = /usr/bin/perl
 CHMOD = /usr/bin/chmod
 CP = /usr/bin/cp
 RM = /usr/bin/rm
@@ -28,10 +28,10 @@ CONFIGURE = $(PERL) ./configure.1.pl
 all: exec
 
 makefile: makefile.1.mak configure.1.pl $(CONFIGFILE)
-	$(CONFIGURE) $(CONFIGFILE) < makefile.1.mak > makefile
+	$(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in makefile.1.mak --out makefile
 
 configure.pl: configure.1.pl
-	$(CONFIGURE) $(CONFIGFILE) < configure.1.pl > configure.pl
+	$(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in configure.1.pl --out configure.pl
 
 exec: configure.pl
 	$(CHMOD) +x configure.pl
diff --git a/makefile.1.mak b/makefile.1.mak
index 3f8ac08..7596466 100644
--- a/makefile.1.mak
+++ b/makefile.1.mak
@@ -28,10 +28,10 @@ CONFIGURE = $(PERL) ./configure.1.pl
 all: exec
 
 makefile: makefile.1.mak configure.1.pl $(CONFIGFILE)
-	$(CONFIGURE) $(CONFIGFILE) < makefile.1.mak > makefile
+	$(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in makefile.1.mak --out makefile
 
 configure.pl: configure.1.pl
-	$(CONFIGURE) $(CONFIGFILE) < configure.1.pl > configure.pl
+	$(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in configure.1.pl --out configure.pl
 
 exec: configure.pl
 	$(CHMOD) +x configure.pl
diff --git a/settings.txt b/settings.txt
index 5a1aa6e..067231b 100644
--- a/settings.txt
+++ b/settings.txt
@@ -4,7 +4,7 @@
 RUN_PERL = @_SHEBANG($perl)
 
 make_target = TARGET = $target
-make_perl = PERL	= $perl
+make_perl = PERL = $perl
 make_chmod = CHMOD = $chmod
 make_cp = CP = $cp
 make_rm = RM = $rm
-- 
2.30.2