improve regexp handling in some functions

author b <rowerynaksiezycu@gmail.com>

Sat, 30 Sep 2023 20:14:15 +0000 (20:14 +0000)

committer b <rowerynaksiezycu@gmail.com>

Sat, 30 Sep 2023 20:14:15 +0000 (20:14 +0000)
author b <rowerynaksiezycu@gmail.com>
Sat, 30 Sep 2023 20:14:15 +0000 (20:14 +0000)
committer b <rowerynaksiezycu@gmail.com>
Sat, 30 Sep 2023 20:14:15 +0000 (20:14 +0000)
diff --git a/botm_common.pm b/botm_common.pm

index 82c62a626aa70c1f2aef798ee930c272916621b0..6d76275fa970d94e5a91e4824ae5f6d73d3ae542 100644 (file)
--- a/botm_common.pm
+++ b/botm_common.pm
@@ -25,7 +25,7 @@ use Encode ('encode', 'decode');
  
  use Exporter;
  
-our $VERSION     = '1.0.21';
+our $VERSION     = '1.0.22';
  our @ISA         = qw(Exporter);
  our @EXPORT      = ();
  our @EXPORT_OK   = (
@@ -113,6 +113,10 @@ sub make_temp_path {
         return join_path('/', $dir, $filename);
  }
  
+###################################
+##  ENCODING + SYSTEM FUNCTIONS  ##
+###################################
+
  sub system_encoded {
         (my $cmd, my @arg) = @_;
         my @newarg;
@@ -235,8 +239,8 @@ sub read_data_file {
                         next;
                 }
                 
-               $line =~ s/[\n]$//g;
-               $line =~ s/[\r]$//g;
+               $line =~ s/\n$//gs;
+               $line =~ s/\r$//gs;
                 
                 # Empty line - end of header.
                 if ($line eq ''){
@@ -244,15 +248,15 @@ sub read_data_file {
                 }
                 # Line starts with whitespace. It's a continuation of the previous line.
                 # Concatenate the field value, separated by newline.
-               elsif($line =~ /^[ \t](.*)$/){
+               elsif($line =~ /^[ \t]/){
                         if($lastname ne '') {
-                               $data{$lastname} .= "\n".$1;
+                               $data{$lastname} .= "\n".$';
                         }
                 }
                 # Line starts with a name followed by colon/equal sign. Save the value
-               elsif ($line =~ /^([ -9;-<>-~]+)((:[ \t])|=)(.*)$/) {
+               elsif ($line =~ /^([ -9;-<>-~]+)((:[ \t])|=)/s) {
                         $name = lc($1);
-                       $value = $4;
+                       $value = $';
                         
                         $data{$name} = $value;
                         
@@ -317,8 +321,8 @@ sub write_data_file {
                         }
                         my $value = $data->{$ind};
                         # convert newlines - add spaces at continuation line
-                       $value =~ s/\r//g;
-                       $value =~ s/\n/\n /g;
+                       $value =~ s/(\r)?\n/\n /g;
+                       $value =~ s/\r/\n /g;
                         print $fh "$name: $value\n";
                 }
         }
@@ -432,8 +436,8 @@ sub read_header_file {
         my $lastname='';
         
         while (defined(my $line = <$fh>)) {
-               $line =~ s/[\n]$//g;
-               $line =~ s/[\r]$//g;
+               $line =~ s/\n$//gs;
+               $line =~ s/\r$//gs;
                 
                 if ($status_line) {
                         $line =~ /^([^ ]+) +([^ ]+)( +([^ ].*))?$/;
@@ -656,9 +660,8 @@ sub split_url {
         else {
                 $data{'authority'} = $url;
                 if ($data{'authority'} =~ m/[\/\?#]/g) {
-                       $ind = pos($data{'authority'})-1;
-                       $url = substr($data{'authority'}, $ind); 
-                       $data{'authority'} = substr($data{'authority'}, 0, $ind);
+                       $data{'authority'} = $`;
+                       $url = $&.$';
                 }
                 else {
                         $url = '';
@@ -677,9 +680,8 @@ sub split_url {
         }
         $data{'host'} =~ m/\[[^\]]*\]/g;
         if ($data{'host'} =~ m/:/g) {
-               $ind = pos($data{'host'})-1;
-               $data{'port'} = substr($data{'host'}, $ind+1);
-               $data{'host'} = substr($data{'host'}, 0, $ind);
+               $data{'host'} = $`;
+               $data{'port'} = $';
         }
         else {
                 $data{'port'} = '';
@@ -689,9 +691,8 @@ sub split_url {
         if (($url =~ /^\//) or $relative) {
                 $data{'path'} = $url;
                 if ($data{'path'} =~ m/[\?#]/g) {
-                       $ind = pos($data{'path'})-1;
-                       $url = substr($data{'path'}, $ind);
-                       $data{'path'} = substr($data{'path'}, 0, $ind);
+                       $data{'path'} = $`;
+                       $url = $&.$';
                 }
                 else {
                         $url = '';
@@ -703,7 +704,7 @@ sub split_url {
         
         # query
         if ($url =~ /^\?/) {
-               $data{'query'} = substr($url, 1);
+               $data{'query'} = $';
                 $ind = index($data{'query'}, '#');
                 if ($ind >= 0) {
                         $url = substr($data{'query'}, $ind);
@@ -719,7 +720,7 @@ sub split_url {
         
         # fragment
         if ($url =~ /^#/) {
-               $data{'fragment'} = substr($url, 1);
+               $data{'fragment'} = $';
         }
         else {
                 $data{'fragment'} = '';
@@ -3304,17 +3305,17 @@ sub html_entity_decode {
                         $d .= $1;
                         $t = $2;
                 }
-               elsif ($t =~ /^(\&#?[A-Za-z0-9]+;)(.*)$/s) { # correct encoded character
-                       $d .= html_entity_decode_1en($1);
-                       $t = $2;
+               elsif ($t =~ /^\&#?[A-Za-z0-9]+;/s) { # correct encoded character
+                       $d .= html_entity_decode_1en($&);
+                       $t = $';
                 }
-               elsif ($t =~ /^(\&[A-Za-z0-9]+)(.*)$/s) { # encoded character without ";"
-                       $d .= html_entity_decode_1en($1);
-                       $t = $2;
+               elsif ($t =~ /^\&[A-Za-z0-9]+/s) { # encoded character without ";"
+                       $d .= html_entity_decode_1en($&);
+                       $t = $';
                 }
-               elsif ($t =~ /^(\&)(.*)$/s) { # invalid "&"
-                       $d .= $1;
-                       $t = $2;
+               elsif ($t =~ /^\&/s) { # invalid "&"
+                       $d .= $&;
+                       $t = $';
                 }
                 else {  # nothing left to decode
                         $d .= $t;
@@ -3336,7 +3337,8 @@ sub html_entity_decode_1en {
         
         if ($en !~ /;$/) { # name without ";"
                 my $n = substr($en, 1);
-               # we HAVE TO iterate :/
+               # we HAVE TO iterate because we don't know where the name terminates :/
+               # why did they think this was a good idea?
                 foreach my $name (keys %{+HTML_ENTITY_CODE_INF}) {
                         if (index($n, $name) == 0) { # name (beginning of entire string) is valid:
                                 # decode
author	b <rowerynaksiezycu@gmail.com>
	Sat, 30 Sep 2023 20:14:15 +0000 (20:14 +0000)
committer	b <rowerynaksiezycu@gmail.com>
	Sat, 30 Sep 2023 20:14:15 +0000 (20:14 +0000)