# the settings from a file and then insert '#define' (C), 'use constant'
# (Perl), or other statements into the files based on pattern matching.
#
-# This is of for configuration done at compile time. Not at run time.
+# This is for configuration done at compile time. Not at run time.
# There the program itself has to read the settings and make decisions.
#
# There also exist things like autoconf, automake, etc. see:
# http://www.mrob.com/pub/comp/unix-building-history.html
-# However for many project I'm currently dealing with this script
+# However, for many projects I'm currently dealing with, this script
# here is totally enough and so far I did not have to learn these tools.
# They are still a black box (or even black magic) to me.
#
# * --i, --in, --input - all further file paths are for input files
# * --o, --out, --output - all further file paths are for output files
# * -- - all further file paths are for configuration files
+# * --e=, --encoding= - sets the character encoding for standard input,
+# standard output, and files
+# * --ef=, --encoding-file= - sets character encoding for files only,
+# overrides --e
+# * --ecf=, --encoding-configfile= - sets character encoding for configuration
+# files only, overrides --ef
# * anything else will be treated as a path to a file: input, output,
# or configuration. By default configuration files are expected.
#
+# Unless defined otherwise, the character encoding for all files is UTF-8,
+# and for standard input/output it's the encoding determined from locale.
+# For parsing command line arguments and for opening files the system
+# locale is always used; this cannot be overridden.
+#
# The script reads configuration from all configuration files given in the
# command line parameters. The effect is similar to reading a single
# concatenated file.
# And afterwards the new target will be the default one.
use strict;
+use utf8;
+use Encode::Locale ('decode_argv');
+use Encode ('encode', 'decode');
use constant MAX_DEPTH => 256;
use constant KEYWORD_PATTERN => '[A-Za-z0-9_\-\.]+';
$cfg{REPLACE_KEYWORD()} = DEFAULT_REPLACE_KEYWORD;
$cfg{PATH_SEPARATOR()} = DEFAULT_PATH_SEPARATOR;
+my $encoding = '';
+my $encoding_file = '';
+my $encoding_configfile = '';
+my $encoding_stdin;
+my $encoding_stdout;
+
+decode_argv();
+
foreach my $arg (@ARGV) {
if ($arg =~ /^--(.*)$/) { # option
$arg = $1;
elsif ($arg =~ /^o(ut(put)?)?$/) {
$file_type = 'o';
}
+ # Character-encoding options; the leading "--" has already been stripped
+ # from $arg at this point. The pattern must be bound with "=~": a plain
+ # "=" would match against $_ and overwrite $arg with the match result.
+ elsif ($arg =~ /^e(?:ncoding)?=(.*)$/) {
+     # --e=ENC / --encoding=ENC
+     $encoding = $1;
+ }
+ # The alternatives are wrapped in a non-capturing group so that both the
+ # "^" anchor and the "=value" suffix apply to the short and the long
+ # spelling alike, and the value is always capture group 1.
+ elsif ($arg =~ /^(?:ef|encoding-file)=(.*)$/) {
+     # --ef=ENC / --encoding-file=ENC
+     $encoding_file = $1;
+ }
+ elsif ($arg =~ /^(?:ecf?|encoding-configfile)=(.*)$/) {
+     # --ecf=ENC / --encoding-configfile=ENC ("--ec=" is accepted too)
+     $encoding_configfile = $1;
+ }
elsif ($arg eq '') {
$file_type = '';
}
}
}
+if ($encoding eq '') {
+ $encoding = 'UTF-8';
+ $encoding_stdin = 'console_in';
+ $encoding_stdout = 'console_out';
+}
+else {
+ $encoding_stdin = $encoding;
+ $encoding_stdout = $encoding;
+}
+if ($encoding_file eq '') {
+ $encoding_file = $encoding;
+}
+if ($encoding_configfile eq '') {
+ $encoding_configfile = $encoding_file;
+}
+
+binmode STDIN, ":encoding($encoding_stdin)";
+binmode STDOUT, ":encoding($encoding_stdout)";
+binmode STDERR, ":encoding($encoding_stdout)";
+
foreach my $file (@config_files) {
- %cfg = parse_file($file, 0, %cfg);
+ %cfg = parse_file($file, $encoding_configfile, 0, %cfg);
}
unless ($debug_all) {
$debug_enabled = 0;
while (@input_files > 0) {
my $in = shift @input_files;
my $out = shift @output_files;
- convert_file($in, $out);
+ convert_file($in, $out, $encoding_file);
}
sub format_cfg {
(my $text, my $match) = @_;
unless (defined $match) {
$match = '[\\\\\\\'\\\"]';
+ # [ \ \ \ ' \ "]
+ # [ \ ' "]
}
my $outcome = '';
return $outcome;
}
-# NOT UNICODE AWARE
+# UTF-8 only
+# TODO: this is almost duplicated from the common library
sub urlencode {
(my $text, my $match) = @_;
unless (defined $match) {
$match = '[^0-9A-Za-z.~\-_]'
}
-
+
my $outcome = '';
foreach my $ch (split('', $text)) {
if ($ch =~ $match) {
- $outcome .= sprintf('%%%02hX',ord($ch));
+ my $enc = encode('UTF-8', $ch);
+ foreach my $b (split('', $enc)) {
+ $outcome .= sprintf('%%%02hX',ord($b));
+ }
}
else {
$outcome .= $ch;
return $outcome;
}
-# NOT UNICODE AWARE
-# (but there no real need to escaping non-ascii characters)
+# TODO: this is almost duplicated from the common library
sub entityencode {
(my $text, my $match) = @_;
unless (defined $match) {
return $outcome;
}
+# TODO: this is duplicated from the common library
sub join_path {
(my $joiner, my @segments) = @_;
}
sub parse_file {
- (my $path, my $depth, my %cfg) = @_;
+ (my $path, my $encoding, my $depth, my %cfg) = @_;
if ($depth >= MAX_DEPTH) {
print STDERR "Too deep.\n";
print_debug($depth, "PARSE FILE $path");
my $file;
- unless (open $file, "<", $path) {
+ unless (open $file, "<:encoding($encoding)", encode('locale_fs', $path)) {
print STDERR "Cannot open configfile $path.\n";
exit 2;
}
{
my $path = parse_value($1, $depth+1, %cfg);
print_debug($depth, "INCLUDE $path");
- %cfg = parse_file($path, $depth+1, %cfg);
+ %cfg = parse_file($path, $encoding, $depth+1, %cfg);
$name = '';
$value = '';
}
}
sub convert_file {
- (my $in, my $out) = @_;
+ (my $in, my $out, my $encoding) = @_;
my $ref_in = ref($in);
my $ref_out = ref($out);
unless ($ref_in) {
my $path = $in;
$in = undef;
- unless (open $in, '<', $path) {
+ unless (open $in, "<:encoding($encoding)", encode('locale_fs', $path)) {
print STDERR "Cannot open input file $path.\n";
exit 2;
}
unless ($ref_out) {
my $path = $out;
$out = undef;
- unless (open $out, '>', $path) {
- print STDERR "Cannot open input file $path.\n";
+ unless (open $out, ">:encoding($encoding)", encode('locale_fs', $path)) {
+ print STDERR "Cannot open output file $path.\n";
unless ($ref_in) {
close($in)
}