From 6b789b4bd0ac1d87d3a2d8598ad229298e2c4777 Mon Sep 17 00:00:00 2001 From: b Date: Tue, 29 Aug 2023 22:31:42 +0000 Subject: [PATCH] rework HTML entity encoding --- botm_common.pm | 112 +++++++++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 40 deletions(-) diff --git a/botm_common.pm b/botm_common.pm index 03518da..a479feb 100644 --- a/botm_common.pm +++ b/botm_common.pm @@ -25,7 +25,7 @@ use Encode ('encode', 'decode'); use Exporter; -our $VERSION = '1.0.17'; +our $VERSION = '1.0.18'; our @ISA = qw(Exporter); our @EXPORT = (); our @EXPORT_OK = ( @@ -831,9 +831,9 @@ sub url_query_decode { # sub join_url_host { # (my $encoding, my @segments) = @_; - + # # my $host = ''; - + # # foreach my $segment (@segments) { # if ($host ne '') { # $host = '.'.$host; @@ -844,7 +844,6 @@ sub url_query_decode { # } - ############################ ## HTML ENTITY ENCODING ## ############################ @@ -3107,16 +3106,12 @@ use constant HTML_ENTITY_CODE_INF => { # in decimal form. # only reserved characters will be escaped. # $t is the text to encode +# if $non_ascii is true then every non-ASCII character will be escaped # if $all is true then every character will be escaped. sub html_entity_encode_dec { - (my $t, my $all) = @_; + (my $t, my $non_ascii, my $all) = @_; - if ($all) { - $t =~ s/(.)/html_entity_encode_1ch_dec($1)/egs; - } - else { - $t =~ s/([\"\&<=>])/html_entity_encode_1ch_dec($1)/egs; - } + $t =~ s/(.)/html_entity_encode_1ch_dec($1,$non_ascii,$all)/egs; return $t; } @@ -3124,16 +3119,12 @@ sub html_entity_encode_dec { # in hexadecimal form. # only reserved characters will be escaped. # $t is the text to encode +# if $non_ascii is true then every non-ASCII character will be escaped # if $all is true then every character will be escaped. sub html_entity_encode_hex { - (my $t, my $all) = @_; + (my $t, my $non_ascii, my $all) = @_; - if ($all) { - $t =~ s/(.)/html_entity_encode_1ch_hex($1)/egs; - } - else { - $t =~ s/([\"\&<=>])/html_entity_encode_1ch_hex($1)/egs; - } + $t =~ s/(.)/html_entity_encode_1ch_hex($1,$non_ascii,$all)/egs; return $t; } @@ -3141,52 +3132,83 @@ sub html_entity_encode_hex { # by name. # only reserved characters will be escaped. # $t is the text to encode -# if $all is true then every character which has a name will be escaped, -# and additionally if a character has more than 1 possible name, then +# if $non_ascii is true then every non-ASCII character will be escaped +# if $all is true then every character will be escaped. +# if $fallback is 'dec' then character without name will be escaped in +# decimal form instead +# or if $fallback is 'hex' then character without name will be escaped +# in hexadecimal form instead +# or otherwise they will not be escaped +# if a character has more than 1 possible name, then # the name will be chosen unpredicatbly. sub html_entity_encode_name { - (my $t, my $all) = @_; + (my $t, my $non_ascii, my $all, my $fallback) = @_; - if ($all) { - $t =~ s/(.)/html_entity_encode_1ch_name($1,1)/egs; - } - else { - $t =~ s/([\"\&<=>])/html_entity_encode_1ch_name($1,0)/egs; - } + $t =~ s/(.)/html_entity_encode_1ch_name($1,$non_ascii,$all,$fallback)/egs; return $t; } # html_entity_encode_dec() escapes a single character # as an HTML entity reference in decimal form. # $ch is the chatacter to encode +# if $non_ascii is true then every non-ASCII character will be escaped +# if $all is true then every character will be escaped. sub html_entity_encode_1ch_dec { - (my $ch) = @_; + (my $ch, my $non_ascii, my $all) = @_; - $ch =~ s/(.)/sprintf('&#%02u;',ord($1))/egs; - return $ch; + my $n = ord($ch); + if ( + $all or + ($non_ascii and ($n > 0x7F)) or + ($ch =~ /[\"\&<=>]/) + ) { + return sprintf('&#%02u;', $n); + } + else { + return $ch; + } } # html_entity_encode_1ch_hex() escapes a single character # as an HTML entity reference in hexadecimal form. # $ch is the chatacter to encode +# if $non_ascii is true then every non-ASCII character will be escaped +# if $all is true then every character will be escaped. sub html_entity_encode_1ch_hex { - (my $ch) = @_; + (my $ch, my $non_ascii, my $all) = @_; - $ch =~ s/(.)/sprintf('&#x%02X;',ord($1))/egs; - return $ch; + my $n = ord($ch); + if ( + $all or + ($non_ascii and ($n > 0x7F)) or + ($ch =~ /[\"\&<=>]/) + ) { + return sprintf('&#x%02X;', $n); + } + else { + return $ch; + } } # html_entity_encode_1ch_name() escapes a single character # as an HTML entity reference by name. # only a reserved character will be escaped. # $ch is the chatacter to encode -# if $all is true then any character which has a name will be escaped, -# and additionally if the character has more than 1 possible name, then +# if $non_ascii is true then every non-ASCII character will be escaped +# if $all is true then every character will be escaped. +# if $fallback is 'dec' then character without name will be escaped in +# decimal form instead +# or if $fallback is 'hex' then character without name will be escaped +# in hexadecimal form instead +# or otherwise they will not be escaped +# if a character has more than 1 possible name, then # the name will be chosen unpredicatbly. sub html_entity_encode_1ch_name { - (my $ch, my $all) = @_; + (my $ch, my $non_ascii, my $all, my $fallback) = @_; + + my $n = ord($ch); - unless ($all) { + if ($ch =~ /[\"\&<=>]/) { if ($ch eq '"') { return '"'; } @@ -3206,16 +3228,26 @@ sub html_entity_encode_1ch_name { return $ch; } } - else { - my $n = ord($ch); + elsif ( + $all or + ($non_ascii and ($n > 0x7F)) + ) { foreach my $name (keys %{+HTML_ENTITY_CODE}) { my @code = @{HTML_ENTITY_CODE->{$name}}; if (($code[0] == $n) and (@code == 1)) { return "&$name;"; } } - return $ch; + + if ($fallback eq 'hex') { + return html_entity_encode_1ch_hex($ch, $non_ascii, $all); + } + elsif ($fallback eq 'dec') { + return html_entity_encode_1ch_dec($ch, $non_ascii, $all); + } } + + return $ch; } # html_entity_decode() decodes a text escaped by HTML entity references -- 2.30.2