Character encoding awareness

author b <rowerynaksiezycu@gmail.com>

Sun, 18 Jun 2023 08:17:03 +0000 (08:17 +0000)

committer b <rowerynaksiezycu@gmail.com>

Sun, 18 Jun 2023 08:17:03 +0000 (08:17 +0000)
author b <rowerynaksiezycu@gmail.com>
Sun, 18 Jun 2023 08:17:03 +0000 (08:17 +0000)
committer b <rowerynaksiezycu@gmail.com>
Sun, 18 Jun 2023 08:17:03 +0000 (08:17 +0000)
diff --git a/configure.1.pl b/configure.1.pl

index 34972c13d99bbcd9785cf8615ac0ff54bddeaa48..c4520676fe76e4f50a8742fc2a1e2c5c83dff9a4 100755 (executable)
--- a/configure.1.pl
+++ b/configure.1.pl
@@ -55,12 +55,12 @@
  #    the settings from a file and then insert '#define' (C), 'use constant'
  #    (Perl), or other statements into the files based on pattern matching.
  #
-#    This is of for configuration done at compile time. Not at run time.
+#    This is for configuration done at compile time. Not at run time.
  #    There the program itself has to read the settings and make decisions.
  #
  #    There also exist things like autoconf, automake, etc. see:
  #    http://www.mrob.com/pub/comp/unix-building-history.html
-#    However for many project I'm currently dealing with this script
+#    However, for many projects I'm currently dealing with, this script
  #    here is totally enough and so far I did not have to learn these tools.
  #    They are still a black box (or even black magic) to me.
  #
@@ -87,9 +87,20 @@
  #   * --i, --in, --input - all further file paths are for input files
  #   * --o, --out, --output - all further file paths are for output files
  #   * -- - all further file paths are for configuration files
+#   * --e=, --encoding= - sets the character encoding for input, output,
+#     and files
+#   * --ef=, --encoding-file= - sets character encoding for files only,
+#     overrides --e
+#   * --ecf=, --encoding-configfile= - sets character encoding for configuration
+#     files only, overrides --ef
  # * anything else will be treated as a path to a file: input, output,
  #   or configuration. By default configuration files are expected.
  #
+# Unless defined otherwise, the character encoding for all files is UTF-8,
+# and for standard input/output it's the encoding determined from locale.
+# For parsing command line arguments and for opening files, the system
+# locale is always used; this cannot be overridden.
+#
  # The script reads configuration from all configuration files given in the
  # command line parameters. The effect is similar to reading a single
  # concatenated file.
@@ -358,6 +369,9 @@
  # And afterwards the new tatget will be the default one.
  
  use strict;
+use utf8;
+use Encode::Locale ('decode_argv');
+use Encode ('encode', 'decode');
  
  use constant MAX_DEPTH               => 256;
  use constant KEYWORD_PATTERN         => '[A-Za-z0-9_\-\.]+';
@@ -391,6 +405,14 @@ $cfg{REPLACE_LINE()}    = DEFAULT_REPLACE_LINE;
  $cfg{REPLACE_KEYWORD()} = DEFAULT_REPLACE_KEYWORD;
  $cfg{PATH_SEPARATOR()}  = DEFAULT_PATH_SEPARATOR;
  
+my $encoding = '';
+my $encoding_file = '';
+my $encoding_configfile = '';
+my $encoding_stdin;
+my $encoding_stdout;
+
+decode_argv();
+
  foreach my $arg (@ARGV) {
         if ($arg =~ /^--(.*)$/) { # option
                 $arg = $1;
@@ -414,6 +436,15 @@ foreach my $arg (@ARGV) {
                 elsif ($arg =~ /^o(ut(put)?)?$/) {
                         $file_type = 'o';
                 }
+               elsif ($arg =~ /^e(?:ncoding)?=(.*)$/) { # --e=ENC / --encoding=ENC: encoding for stdin/stdout and files
+                       $encoding = $1;
+               }
+               elsif ($arg =~ /^(?:ef|encoding-file)=(.*)$/) { # --ef=ENC: files only; names grouped so '=' applies to both
+                       $encoding_file = $1;
+               }
+               elsif ($arg =~ /^(?:ecf?|encoding-configfile)=(.*)$/) { # --ecf=ENC: configuration files only
+                       $encoding_configfile = $1;
+               }
                 elsif ($arg eq '') {
                         $file_type = '';
                 }
@@ -434,8 +465,28 @@ foreach my $arg (@ARGV) {
         }
  }
  
+if ($encoding eq '') {
+       $encoding        = 'UTF-8';
+       $encoding_stdin  = 'console_in';
+       $encoding_stdout = 'console_out';
+}
+else {
+       $encoding_stdin  = $encoding;
+       $encoding_stdout = $encoding;
+}
+if ($encoding_file eq '') {
+       $encoding_file = $encoding;
+}
+if ($encoding_configfile eq '') {
+       $encoding_configfile = $encoding_file;
+}
+
+binmode STDIN,  ":encoding($encoding_stdin)";
+binmode STDOUT, ":encoding($encoding_stdout)";
+binmode STDERR, ":encoding($encoding_stdout)";
+
  foreach my $file (@config_files) {
-       %cfg = parse_file($file, 0, %cfg);
+       %cfg = parse_file($file, $encoding_configfile, 0, %cfg);
  }
  unless ($debug_all) {
         $debug_enabled = 0;
@@ -469,7 +520,7 @@ if (@input_files == 0) {
  while (@input_files > 0) {
         my $in = shift @input_files;
         my $out = shift @output_files;
-       convert_file($in, $out);
+       convert_file($in, $out, $encoding_file);
  }
  
  sub format_cfg {
@@ -541,6 +592,8 @@ sub escape {
         (my $text, my $match) = @_;
         unless (defined $match) {
                 $match = '[\\\\\\\'\\\"]';
+               #         [ \ \ \ ' \ "]
+               #         [   \   '   "]
         }
         
         my $outcome = '';
@@ -562,17 +615,21 @@ sub escape {
         return $outcome;
  }
  
-# NOT UNICODE AWARE
+# UTF-8 only
+# TODO: this is almost duplicated from the common library
  sub urlencode {
         (my $text, my $match) = @_;
         unless (defined $match) {
                 $match = '[^0-9A-Za-z.~\-_]'
         }
-       
+               
         my $outcome = '';
         foreach my $ch (split('', $text)) {
                 if ($ch =~ $match) {
-                       $outcome .= sprintf('%%%02hX',ord($ch));
+                       my $enc = encode('UTF-8', $ch);
+                       foreach my $b (split('', $enc)) {
+                               $outcome .= sprintf('%%%02hX',ord($b));
+                       }
                 }
                 else {
                         $outcome .= $ch;
@@ -581,8 +638,7 @@ sub urlencode {
         return $outcome;
  }
  
-# NOT UNICODE AWARE
-# (but there no real need to escaping non-ascii characters)
+# TODO: this is almost duplicated from the common library
  sub entityencode {
         (my $text, my $match) = @_;
         unless (defined $match) {
@@ -601,6 +657,7 @@ sub entityencode {
         return $outcome;
  }
  
+# TODO: this is duplicated from the common library
  sub join_path {
         (my $joiner, my @segments) = @_;
         
@@ -905,7 +962,7 @@ sub parse_value {
  }
  
  sub parse_file {
-       (my $path, my $depth, my %cfg) = @_;
+       (my $path, my $encoding, my $depth, my %cfg) = @_;
         
         if ($depth >= MAX_DEPTH) {
                 print STDERR "Too deep.\n";
@@ -915,7 +972,7 @@ sub parse_file {
         print_debug($depth, "PARSE FILE $path");
         
         my $file;
-       unless (open $file, "<", $path) {
+       unless (open $file, "<:encoding($encoding)", encode('locale_fs', $path)) {
                 print STDERR "Cannot open configfile $path.\n";
                 exit 2;
         }
@@ -955,7 +1012,7 @@ sub parse_file {
                 {
                         my $path = parse_value($1, $depth+1, %cfg);
                         print_debug($depth, "INCLUDE $path");
-                       %cfg = parse_file($path, $depth+1, %cfg);
+                       %cfg = parse_file($path, $encoding, $depth+1, %cfg);
                         $name = '';
                         $value = '';
                 }
@@ -979,7 +1036,7 @@ sub parse_file {
  }
  
  sub convert_file {
-       (my $in, my $out) = @_;
+       (my $in, my $out, my $encoding) = @_;
         
         my $ref_in = ref($in);
         my $ref_out = ref($out);
@@ -996,7 +1053,7 @@ sub convert_file {
         unless ($ref_in) {
                 my $path = $in;
                 $in = undef;
-               unless (open $in, '<', $path) {
+               unless (open $in, "<:encoding($encoding)", encode('locale_fs', $path)) {
                         print STDERR "Cannot open input file $path.\n";
                         exit 2;
                 }
@@ -1004,8 +1061,8 @@ sub convert_file {
         unless ($ref_out) {
                 my $path = $out;
                 $out = undef;
-               unless (open $out, '>', $path) {
-                       print STDERR "Cannot open input file $path.\n";
+               unless (open $out, ">:encoding($encoding)", encode('locale_fs', $path)) {
+                       print STDERR "Cannot open output file $path.\n";
                         unless ($ref_in) {
                                 close($in)
                         }
diff --git a/makefile b/makefile

index 1ab71eafc6b5715ac579d56d912026589f79946f..7c71f4a8302d8858318943a679e484e442de8cf3 100644 (file)
--- a/makefile
+++ b/makefile
@@ -11,7 +11,7 @@ endif
  CONFIGFILE = settings-$(TARGET).txt settings.txt
  DEFAULT_CONFIGFILE = settings-$(DEFAULT_TARGET).txt settings.txt
  
-PERL    = /usr/bin/perl
+PERL  = /usr/bin/perl
  CHMOD = /usr/bin/chmod
  CP    = /usr/bin/cp
  RM    = /usr/bin/rm
@@ -28,10 +28,10 @@ CONFIGURE = $(PERL) ./configure.1.pl
  all: exec
         
  makefile: makefile.1.mak configure.1.pl $(CONFIGFILE)
-       $(CONFIGURE) $(CONFIGFILE) < makefile.1.mak > makefile
+       $(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in makefile.1.mak --out makefile
  
  configure.pl: configure.1.pl
-       $(CONFIGURE) $(CONFIGFILE) < configure.1.pl > configure.pl
+       $(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in configure.1.pl --out configure.pl
  
  exec: configure.pl
         $(CHMOD) +x configure.pl
diff --git a/makefile.1.mak b/makefile.1.mak

index 3f8ac086cb9fe8240fce1484878403433a9ea133..75964662e7d124677fee39cd23951d413bd5ade0 100644 (file)
--- a/makefile.1.mak
+++ b/makefile.1.mak
@@ -28,10 +28,10 @@ CONFIGURE = $(PERL) ./configure.1.pl
  all: exec
         
  makefile: makefile.1.mak configure.1.pl $(CONFIGFILE)
-       $(CONFIGURE) $(CONFIGFILE) < makefile.1.mak > makefile
+       $(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in makefile.1.mak --out makefile
  
  configure.pl: configure.1.pl
-       $(CONFIGURE) $(CONFIGFILE) < configure.1.pl > configure.pl
+       $(CONFIGURE) $(CONFIGFILE) --ef=UTF-8 --in configure.1.pl --out configure.pl
  
  exec: configure.pl
         $(CHMOD) +x configure.pl
diff --git a/settings.txt b/settings.txt

index 5a1aa6e4c355b478e4258b99f7b73afc5f17e91b..067231b8b5c40c2d7d5cf850237ae75812267959 100644 (file)
--- a/settings.txt
+++ b/settings.txt
@@ -4,7 +4,7 @@ RUN_PERL = @_SHEBANG($perl)
  
  make_target = TARGET = $target
  
-make_perl  = PERL    = $perl
+make_perl  = PERL  = $perl
  make_chmod = CHMOD = $chmod
  make_cp    = CP    = $cp
  make_rm    = RM    = $rm
author	b <rowerynaksiezycu@gmail.com>
	Sun, 18 Jun 2023 08:17:03 +0000 (08:17 +0000)
committer	b <rowerynaksiezycu@gmail.com>
	Sun, 18 Jun 2023 08:17:03 +0000 (08:17 +0000)
configure.1.pl		patch \| blob \| history
makefile		patch \| blob \| history
makefile.1.mak		patch \| blob \| history
settings.txt		patch \| blob \| history