From: b Date: Wed, 18 Nov 2015 21:59:21 +0000 (+0000) Subject: Archiving finished for now. X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=6e3fdb354832c88e2b130fd673f2e092d486f0bf;p=yplom%2Fproxy Archiving finished for now. Removing old files from the archive. Renaming/compressing/removing of old logs. Added comments to explain the code. git-svn-id: svn://botcastle1b/yplom/proxy@10 05ce6ebb-7522-4a6e-a768-0026ae12be9f --- diff --git a/access.1.c b/access.1.c index 160d4b8..442b827 100644 --- a/access.1.c +++ b/access.1.c @@ -1,4 +1,9 @@ -//The SETUID wrapper. +// access.c is generated from access.1.c +// +// This is the wrapper for access.pl. +// It's run with SETUID to have accesss to some files where the www server +// should not. That's why it has a C wrapper. In modern systems running scripts +// directly with SETUID is considered unsafe and not allowed. #include #include diff --git a/access.1.pl b/access.1.pl index fee2b63..4cd3e2f 100644 --- a/access.1.pl +++ b/access.1.pl @@ -1,5 +1,30 @@ ###PERL; +# access.pl is generated from access.1.pl +# +# This is the Squid helper program used for access control, it detects if the +# proxy is unlocked for the user and tells Squid if it can open the tunnel or +# not. +# +# It's run with SETUID to have accesss to some files where the Squid server +# should not. That's why it has a C wrapper. In modern systems running scripts +# directly with SETUID is considered unsafe and not allowed. +# When running with SETUID the program also has to deal with Perl's built-in +# securuity. +# +# Any data comming from standard input, files, environment variables, command +# line is treated by Perl as unsafe. Any data created by operations on unsafe +# data is also unsafe. Perl will not allow to use such data to open files (for +# writing), launch programs, etc. +# That's why the data has to be validated and made safe, like this: +# if ($variable =~/^(some-regex)$/) { +# $variable = $1; +# } +# +# see also: +# http://wiki.squid-cache.org/Features/AddonHelpers#Access_Control_.28ACL.29 +# http://perldoc.perl.org/perlsec.html#Taint-mode + ###ACCESS_PATH; ###TIMEOUT_UNLOCK; ###TIMEOUT_INACT; @@ -15,15 +40,19 @@ while () { } $accesstime = time(); - # print '+++ '.$line."\n"; + # The input line is expected to have the IP. if ($line =~ /^([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)( .*)?$/) { $IP = $1; + + # If access file doesn't exist the proxy is not unlocked for this IP. $accesspath=ACCESS_PATH.$IP; if (! (-e $accesspath)) { - print "ERR'n"; + print "ERR\n"; } elsif (open ($accessfile,"+<",$accesspath)) { + # The access file has to be locked otherwise one process may read it while + # another one updates it. unless (flock ($accessfile, 2)) { close ($accessfile); print "ERR\n"; @@ -32,9 +61,12 @@ while () { $unlocktime=<$accessfile>; $lasttime=<$accessfile>; + # First line is the timestamp of proxy unlocking. Second line is the + # timestamp of last access. $unlocktime =~ s/[\r\n]//g; $lasttime =~ s/[\r\n]//g; + # The timestamps must be numbers. if ($unlocktime =~ /^([0-9]+)$/) { $unlocktime=int($1); } @@ -54,11 +86,14 @@ while () { } if ((abs($accesstime-$unlocktime)>$timeout_unlock) or (abs($accesstime-$lasttime)>$timeout_inact)){ + # If too much time passed proxy is not unlocked any more. Also the + # access file is now useless and will be removed. close ($accessfile); unlink $accesspath; print "ERR\n"; } else { + # the proxy is unlocked. The access file will now be updated. if (seek($accessfile, 0, 0)) { print $accessfile "$unlocktime\n$accesstime\n"; truncate ($accessfile , tell($accessfile)); @@ -68,7 +103,7 @@ while () { } } else { - print "BH\n"; + print "ERR\n"; } } else { diff --git a/cleararch.1.pl b/cleararch.1.pl new file mode 100644 index 0000000..63c1b5f --- /dev/null +++ b/cleararch.1.pl @@ -0,0 +1,67 @@ +###PERL; + +# cleararch is generated from cleararch.1.pl. +# +# This script recursively clears the archive from old files. Otherwise it would +# grow forever. + +use POSIX qw(strftime); + +###ARCH_PATH; +###TIMEOUT_ARCH; + +$time = time(); +print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n"; +cleardir(ARCH_PATH); + +sub cleardir { + (my $dirpath) = @_; + my $dir; + my $returnvalue = 1; + my $subpath; + my $subpathfull; + my @stat; + unless ( opendir ($dir, $dirpath)) { + print "Failed to open: $dirpath\/\n"; + return 0; + } + while ($subpath = readdir $dir) { + $subpathfull=$dirpath.$subpath; + if ($subpath =~ /^\.\.?$/) { + next; + } + if (-f $subpathfull) { + unless (@stat = stat $subpathfull) { + print "Stat fail: $subpathfull\n"; + $returnvalue = 0; + next; + } + if (abs($time - $stat[9]) > TIMEOUT_ARCH) { + unless (unlink $subpathfull) { + print "Failed to remove: $subpathfull\n"; + $returnvalue = 0; + next; + } + print "Removed: $subpathfull\n"; + } + else { + $returnvalue = 0; + } + } + elsif (-d $subpathfull) { + unless (cleardir($subpathfull.'/')) { + $returnvalue = 0; + next; + } + unless (rmdir($subpathfull.'/')) { + print "Failed to remove: $subpathfull\/\n"; + $returnvalue = 0; + next; + } + print "Removed: $subpathfull\/\n"; + } + } + closedir($dir); + return $returnvalue; +} +print "\n"; diff --git a/config.1.txt b/config.1.txt index f1c983b..f5b5182 100644 --- a/config.1.txt +++ b/config.1.txt @@ -1,3 +1,4 @@ +# config.txt is generated from config.1.txt ################################################################################ #copy this to your Apache2 configuration, #remember to make the server listen on these ports: @@ -23,8 +24,8 @@ ###CGI_ALIAS; SSLEngine on - SSLCertificateFile /etc/apache2/ssl/botm.crt - SSLCertificateKeyFile /etc/apache2/ssl/botm.key +###SSL_CERT; +###SSL_KEY; #SSLOptions +StdEnvVars @@ -61,3 +62,5 @@ coredump_dir /var/spool/squid ################################################################################ #Copy this to your crontab: ###RM_ACCESS_CRONTAB; +###CLEARARCH_CRONTAB; +###OLDLOGS_CRONTAB; diff --git a/configure.pl b/configure.pl index 3fcd1e7..a0900bc 100755 --- a/configure.pl +++ b/configure.pl @@ -1,5 +1,12 @@ #!/usr/bin/perl +# The proxy software, when run on a server, will use different directories, +# host names, tcp ports, etc. than the server on which this software was +# originally written. +# These things are defined in the file 'settings'. +# This script is called from the makefile. It reads the settings file and +# inserts the information in the source files. + unless ($ARGV[0]) { print STDERR "Configfile missing.\n"; exit 1; @@ -10,6 +17,8 @@ unless (open $configfile, "<", $ARGV[0]) { exit 2; } +# Read the config file, line format: +# some_name = some value # some comment while ($line = <$configfile>) { $line =~ s/[\r\n]//g; $line =~ s/#.*$//; @@ -22,9 +31,13 @@ while ($line = <$configfile>) { } close ($configfile); +# Now generate things to be inserted. + $def{'UNLOCK_LOG'} = "use constant UNLOCK_LOG => '".$set{'log_path'}."unlock.log';"; $def{'CURL_PATH'} = "use constant CURL_PATH => '".$set{'curl'}."';"; +$def{'GZIP_PATH'} = "use constant GZIP_PATH => '".$set{'gzip'}."';"; $def{'DATA_PATH'} = "use constant DATA_PATH => '".$set{'data_path'}."';"; +$def{'LOG_PATH'} = "use constant LOG_PATH => '".$set{'log_path'}."';"; $def{'TEMP_PATH'} = "use constant TEMP_PATH => '".$set{'tmp_path'}."';"; $def{'PASS_PATH'} = "use constant PASS_PATH => '".$set{'data_path'}."pass/';"; $def{'ARCH_PATH'} = "use constant ARCH_PATH => '".$set{'data_path'}."archive/';"; @@ -37,6 +50,10 @@ $def{'BLOCK_HOST'} = "use constant BLOCK_HOST => qr/".$set{'block $def{'BLOCK_PORT'} = 'use constant BLOCK_PORT => qr/^((0*'.$set{'http_proxy_port'}.')|('.$set{'https_proxy_port'}.')|('.$set{'ssl_proxy_port'}.'))$/;'; $def{'TIMEOUT_UNLOCK'} = "use constant TIMEOUT_UNLOCK => ".$set{'timeout_unlock'}.";"; $def{'TIMEOUT_INACT'} = "use constant TIMEOUT_INACT => ".$set{'timeout_inact'}.";"; +$def{'TIMEOUT_ARCH'} = "use constant TIMEOUT_ARCH => ".$set{'timeout_arch'}.";"; +$def{'LOG_SIZE_LIMIT'} = "use constant LOG_SIZE_LIMIT => ".$set{'log_size_limit'}.";"; +$def{'LOGS_UNCOMPRESSED'} = "use constant LOGS_UNCOMPRESSED => ".$set{'logs_uncompressed'}.";"; +$def{'LOGS_TOTAL'} = "use constant LOGS_TOTAL => ".$set{'logs_total'}.";"; $def{'REWRITE_URL'} = "use constant REWRITE_URL => '".$set{'https_proxy_domain'}.":".$set{'https_proxy_port'}."';"; $def{'PATH'} = "\$ENV{'PATH'} = '".$set{'path'}."';"; @@ -68,7 +85,11 @@ $def{'HTTP_PORT_SSL'} = 'http_port '.$set{'ssl_proxy_port'}; $def{'EXTERNAL_ACL'} = 'external_acl_type unlocked-check ttl=15 negative_ttl=0 %SRC '.$set{'bin_path'}.'access'; $def{'EXTERNAL_REWRITE'} = 'url_rewrite_program '.$set{'bin_path'}.'rewrite'; $def{'UNLOCK_DOMAIN_ACL'} = 'acl unlockdomain dstdomain '.$set{'unlock_domain'}; + $def{'RM_ACCESS_CRONTAB'} = $set{'rm_access_crontab'}.' '.$set{'rm'}.' '.$set{'data_path'}.'access/*'; +$def{'CLEARARCH_CRONTAB'} = $set{'cleararch_crontab'}.' '.$set{'bin_path'}.'cleararch >> '.$set{'log_path'}.'cleararch.log'; +$def{'OLDLOGS_CRONTAB'} = $set{'oldlogs_crontab'}.' '.$set{'bin_path'}.'oldlogs'; + $def{'CC'} = 'CC='.$set{'gcc'}; $def{'CF'} = 'CF='.$set{'c_flags'}; @@ -80,7 +101,9 @@ $def{'OD'} = 'OD='.$set{'bin_path'}; $def{'CM'} = 'CM='.$set{'chmod'}; - +# Now go through input file, find lines to be replaced. Format: +# ###SOME_NAME; +# If found - replace. while ($line = ) { $line =~ s/[\r\n]//g; @@ -91,7 +114,3 @@ while ($line = ) { print "$line\n"; } } - - - - \ No newline at end of file diff --git a/make.sh b/make.sh index ed4980f..f584d54 100644 --- a/make.sh +++ b/make.sh @@ -1,5 +1,9 @@ #!/bin/sh +# This is the script for making the software. Normally, the makefile is used for +# this purpose. But the makefile has to be generated first. +# This script generates the makefile and then uses it and finally removes it. + set -x perl configure.pl settings makefile make diff --git a/makefile.1.mak b/makefile.1.mak index f3ad995..fe4e8cc 100644 --- a/makefile.1.mak +++ b/makefile.1.mak @@ -1,3 +1,5 @@ +# makefile is generated from makefile.1.mak. + ###CC; ###CF; ###PL; @@ -10,8 +12,8 @@ all: moveout copyout remove config.txt -moveout: proxy proxy.pl rewrite access access.pl setuid exec - $(MV) proxy proxy.pl access access.pl rewrite $(OD) +moveout: proxy proxy.pl cleararch oldlogs rewrite access access.pl setuid exec + $(MV) proxy proxy.pl cleararch oldlogs access access.pl rewrite $(OD) copyout: setuid exec # $(CP) access.pl $(OD) @@ -19,8 +21,8 @@ copyout: setuid exec setuid: proxy access $(CM) u+s proxy access -exec: rewrite access.pl proxy.pl - $(CM) +x rewrite access.pl proxy.pl +exec: cleararch oldlogs rewrite access.pl proxy.pl + $(CM) +x cleararch oldlogs rewrite access.pl proxy.pl remove: proxy proxy.c access access.c copyout moveout setuid exec $(RM) proxy.c access.c @@ -48,6 +50,11 @@ access.c: access.1.c configure.pl settings access: access.c $(CC) $(CF) -o access access.c +cleararch: cleararch.1.pl configure.pl settings + $(PL) configure.pl settings cleararch + +oldlogs: oldlogs.1.pl configure.pl settings + $(PL) configure.pl settings oldlogs config.txt: config.1.txt configure.pl settings $(PL) configure.pl settings config.txt diff --git a/oldlogs.1.pl b/oldlogs.1.pl new file mode 100644 index 0000000..65e9497 --- /dev/null +++ b/oldlogs.1.pl @@ -0,0 +1,77 @@ +###PERL; + +# oldlogs is generated from oldlogs.1.pl. +# +# This script renames log files if they are big enough. +# Compresses or removes older log files. + +###LOG_PATH; +###GZIP_PATH; +###LOG_SIZE_LIMIT; +###LOGS_TOTAL; +###LOGS_UNCOMPRESSED; + +if ( opendir ($dir, LOG_PATH)) { + while ($subpath = readdir $dir) { + if ($subpath !~ /\.log$/) { + next; + } + $fullpath=LOG_PATH.$subpath; + unless (-f $fullpath) { + next; + } + unless (@stat = stat($fullpath)) { + next; + } + if ($stat[7] > LOG_SIZE_LIMIT) { + movelog($fullpath,0,0); + } + + } + closedir($dir); +} + +sub movelog { + (my $path, my $number, my $gz) = @_; + my $nextgz = 0; + my $thispath; + my $nextpath; + my $nextnumber=$number+1; + my @gzip_arg = (GZIP_PATH, '-q', '-9','-f'); + + $thispath = $path.(($number != 0)?'.'.$number.($gz?'.gz':''):''); + if ($number == LOGS_TOTAL) { + if (unlink $thispath) { + return 1; + } + else { + return 0; + } + } + if ($number == LOGS_UNCOMPRESSED) { + $nextgz=1; + $nextpath = $path.'.'.$nextnumber.'.gz'; + } + else { + $nextpath = $path.'.'.$nextnumber.($gz?'.gz':''); + } + + if (-e $nextpath) { + unless (movelog($path,$nextnumber,($nextgz or $gz)?1:0)) { + return 0; + } + } + + if ($nextgz) { + push @gzip_arg, $thispath; + unless (! system (@gzip_arg)) { + return 0; + } + $thispath .= '.gz'; + } + + unless (rename ($thispath, $nextpath)) { + return 0; + } + return 1; +} diff --git a/proxy.1.c b/proxy.1.c index edf32c4..85fa4aa 100644 --- a/proxy.1.c +++ b/proxy.1.c @@ -1,4 +1,9 @@ -//The SETUID wrapper. +// proxy.c is generated from proxy.1.c +// +// This is the wrapper for proxy.pl. +// It's run with SETUID to have accesss to some files where the www server +// should not. That's why it has a C wrapper. In modern systems running scripts +// directly with SETUID is considered unsafe and not allowed. #include #include diff --git a/proxy.1.pl b/proxy.1.pl index bf53924..114c55b 100755 --- a/proxy.1.pl +++ b/proxy.1.pl @@ -1,5 +1,38 @@ ###PERL; +# proxy.pl is generated from proxy.1.pl +# +# This is the proxy software. It's launched as a CGI program by the http server. +# +# The proxy: +# - determines information about the http(s) request, +# - checks if user is allowed to use it, +# - forwards the requests using curl, +# - sends the response back to the user, +# - saves a redundant copy of the response. +# +# It's run with SETUID to have accesss to some files where the www server +# should not. That's why it has a C wrapper. In modern systems running scripts +# directly with SETUID is considered unsafe and not allowed. +# When running with SETUID the program also has to deal with Perl's built-in +# securuity. +# +# Any data comming from standard input, files, environment variables, command +# line is treated by Perl as unsafe. Any data created by operations on unsafe +# data is also unsafe. Perl will not allow to use such data to open files (for +# writing), launch programs, etc. +# That's why the data has to be validated and made safe, like this: +# if ($variable =~/^(some-regex)$/) { +# $variable = $1; +# } +# +# see also: +# http://perldoc.perl.org/perlsec.html#Taint-mode +# https://tools.ietf.org/html/rfc2616 +# https://tools.ietf.org/html/rfc3875 +# https://tools.ietf.org/html/rfc2396 + + use POSIX qw(strftime); ###UNLOCK_LOG; @@ -27,10 +60,12 @@ $timeout_inact = TIMEOUT_INACT*60; delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; ###PATH; +# If the special URL is used, instaed of proxying there will be proxy unlocking. if (($ENV{'HTTP_HOST'} =~ UNLOCK_PROXY_HOST) and (urldecode($ENV{'PATH_INFO'}) =~ UNLOCK_PROXY_PATH)){ unlock(); } else { + # proxy will work only if unlocked if(access()) { proxy(); } @@ -39,15 +74,23 @@ else { } } +# The proxying function sub proxy { - # srand(time()+$$); + # The required information is stored in environment variables (rfc3875). + # The HTTPS variable is not defined in rfc3875 (might be a portability + # problem?) - $archive = 0; + $archive = 0; # if this http request should be archived + # The proxy will not support request methods other than GET, POST, HEAD. + # There is no need for it. if ($ENV{'REQUEST_METHOD'} !~ /^(HEAD|GET|POST)$/) { return fail("Status: 405 Method Not Allowed\nAllow: GET, POST, HEAD\n","405 Method Not Allowed","The proxy does not support the $ENV{'REQUEST_METHOD'} method."); } + # The proxy may be presented with either a complete URL (when it's a HTTP + # proxy) or just the path (when it's a HTTPS proxy). + # In this case the URL has to be reconstructed. if ($ENV{'REQUEST_URI'} =~ /^[a-z]*(:[0-9]*)?:\/\/.*$/){ $URL=$ENV{'REQUEST_URI'}; } @@ -55,7 +98,10 @@ sub proxy { $URL=($ENV{'HTTPS'} eq 'on' ? 'https://' : 'http://').$ENV{'HTTP_HOST'}.$ENV{'REQUEST_URI'}; } - if (validateurl($URL)) { #validate URL against illegal characters + # The URL will now be validated. + # Additionally it will be divided in parts which will be stored in variables: + # $prot (protocol), $host, $port, $path, $query. + if (validateurl($URL)) { $URL =~ /^(.*)$/; $URL = $1; } @@ -63,35 +109,45 @@ sub proxy { return fail("Status: 400 Bad Request\n","400 Bad Request","Invalid URL: $URL."); } + # If port not defined default value is used. if ($port eq ''){ $port=($ENV{'HTTPS'} eq 'on' ? '443' : '80'); } - - if ($host =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) {#IP instead of hostname + + # The host can be a hostname or IP. + # A request to a private IP number gives users access to the proxy's local + # network. This should not be allowed. + if ($host =~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) { if ($host =~ /^((0*10\..*)|(0*127\..*)|(0*172\.0*((1[6-9])|(2[0-9])|(3[0-1]))\..*)|(0*192\.0*168\..*))$/) { return fail("Status: 403 Forbidden\n","403 Forbidden","The proxy does not accept a private IP address $host as hostname."); } } + # Of course, if there is a web server inside the local network and is accessed + # by its hostname it can be accessible. But the server which runs the proxy + # can have some hostnames defined that point to computers in local network + # that are not available from outside. These should be blocked as well. if ($host =~ BLOCK_HOST) { return fail("Status: 403 Forbidden\n","403 Forbidden","The proxy does not accept $host as hostname."); } + # The proxy will block any access with a port number that is used by the + # proxy. Otherwise it would be possible to create an infinite loop in which + # the request is proxied back to the proxy again. Of course, to create such a + # loop not only the port number but also the host name is required. But is not + # always easy (or possible) to predict if a hostname will point to the proxy + # or not. Blocking port numbers is easier. + #That's why the proxy should not be installed on the default port numbers, 80 + # or 443 because then it will not work! if ($port =~ BLOCK_PORT) { return fail("Status: 403 Forbidden\n","403 Forbidden","The proxy does not accept port number $port because of infinite loop prevention."); } - # do { - # $temppath=TEMP_PATH.int(rand(100000000)).'_'; - # $headpath=$temppath.'h'; - # $downpath=$temppath.'d'; - # $uppath=$temppath.'u'; - # } while (( -e $headpath) or ( -e $downpath) or ( -e $uppath)); - - @curl_arg=(CURL_PATH,'-i','-#'); #-s - @reqhead; + @curl_arg=(CURL_PATH,'-i','-#'); # the command line arguments for curl + @reqhead; # the HTTP request header foreach $envk (keys %ENV) { + # The relevant http variables either start with HTTP_ or CONTENT_. if ($envk =~ /^(HTTP_[A-Z0-9_]+)$/) { $headname=formatheader($1); } @@ -101,12 +157,15 @@ sub proxy { else { next; } + # test against illegal characters if ($ENV{$envk} =~ /^([\x20-\x7e]*)$/) { $headval=$1; } else { next; } + + # Some header fields should not be forwarded. if(lc($headname) =~ REQUEST_HEADER_BLOCK) { next; } @@ -117,6 +176,10 @@ sub proxy { push @curl_arg, $headarg; push @reqhead, $headarg; } + + # curl uses its default values of User-agent and Accept header fields unless + # they are defined in the command line. + # If the original request doesn't contain these fields this will undefine them: unless (exists $ENV{'HTTP_USER_AGENT'}) { push @curl_arg, '-H'; push @curl_arg, 'User-Agent:'; @@ -131,65 +194,45 @@ sub proxy { } if ($ENV{'REQUEST_METHOD'} eq 'POST'){ - # binmode(STDIN) or return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to switch STDIN to binary mode."); - # open($upfile,">",$uppath) or return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to create temporary file."); - # binmode ($upfile) or return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to switch temporary file to binary mode."); - - # $buffer; - # while (read (STDIN,$buffer,65536)) { - # unless (print ($upfile $buffer)) { - # return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to write to temporary file."); - # } - # } - # close ($upfile); - # $craetedup=1; - - $postdata=1; + $postdata=1; # if there is data to pe posted push @curl_arg, '--data-binary'; - # push @curl_arg, '@'.$uppath; push @curl_arg, '@-'; - } - $archpath = makepath($prot, $host, $port, $path); + # The path for archiving the HTTP request is made, the required directories + # are created. + # If this fails the request will not be archived but will still be proxied. + $archpath = makepath($prot, $host, $port, $path, $query); if ($archpath) { $archive = 1; - if ($query or $postdata) { + if ($postdata) { + # If this is a POST request the URL is not enough to identify it. To the + # path will be added a number. Also the request header and post data will + # be archived and not only the response header and response data. for (my $i = 0; ;++$i) { $respcontpath=$archpath.'@v-'.$i; $respheadpath=$archpath.'@h-'.$i; $postheadpath=$archpath.'@g-'.$i; $postcontpath=$archpath.'@u-'.$i; - $postquerpath=$archpath.'@q-'.$i; - if ((! -e $respcontpath) and (! -e $respheadpath) and (! -e $postheadpath) and (! -e $postcontpath) and (! -e $postquerpath)) { + if ((! -e $respcontpath) and (! -e $respheadpath) and (! -e $postheadpath) and (! -e $postcontpath)) { last; } } - if ($query) { - if (open ($postquerfile, ">", $postquerpath)) { - print $postquerfile "$query\n"; - close($postquerfile); - } - } - - if ($postdata) { - if (open ($postheadfile, ">", $postheadpath)) { - foreach $line (@reqhead) { - print $postheadfile "$line\n"; - } + # The request header is written. + if (open ($postheadfile, ">", $postheadpath)) { + foreach $line (@reqhead) { print $postheadfile "$line\n"; - close ($postheadfile); } + print $postheadfile "$line\n"; + close ($postheadfile); } + } else { $respcontpath=$archpath.'@v'; $respheadpath=$archpath.'@h'; - $postheadpath=''; - $postcontpath=''; - $postquerpath=''; } } else { @@ -198,12 +241,14 @@ sub proxy { push @curl_arg, $URL; + # The pipes for communication with curl are created. pipe ($curloutrd, $curloutwr) or return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to create stdout pipe."); pipe ($curlerrrd, $curlerrwr) or return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to create stderr pipe."); if ($postdata){ pipe ($curlinrd, $curlinwr) or return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","Failed to create stdin pipe."); } + # curl, the program that will send the forwarded request is launched here $curlpid = fork(); unless (defined $curlpid) { return fail("Status: 500 Internal Server Error\n","500 Internal Server Error","fork() failed."); @@ -212,6 +257,7 @@ sub proxy { close ($curloutrd); close ($curlerrrd); + # The input and outputs of curl are redirected to pipes. open(STDERR, ">&=" . fileno($curlerrwr)) or die "Failed to open stderr pipe.\n"; open(STDOUT, ">&=" . fileno($curloutwr)) or die "Failed to open stdout pipe.\n"; @@ -228,11 +274,19 @@ sub proxy { close ($curlerrwr); if($postdata) { + # If this is a POST request the post data is available on the standard + # input. It will be piped to curl and written to a file (if successfully + # opened). + # Originally this part would be in a separate thread but: + # 1. It didn't work. Not sure why. + # 2. It was not necaessary. + # $curlpidin = fork(); # unless (defined $curlpidin) { # $forceexit=1; # } # unless ($curlpidin) { + close($curlinrd); binmode($curlinwr) or die "Failed to switch stdin pipe to binary mode.\n"; binmode(STDIN) or die "Failed to switch STDIN to binary mode.\n"; @@ -259,36 +313,49 @@ sub proxy { if ($postcontopen) { close($postcontfile); } + # exit 0; # } # close ($curinwr); # close ($curlinrd); } - $safelength=1; - $definedlength=0; - $safeerror=1; - $lastskipped=1; - $forceexit=0; + $safelength=1; # if Content-length is safe to be forwarded + $definedlength=0; # if Content-length is defined + $safeerror=1; # if it's safe to send an error message. + $lastskipped=1; # if the previous header field was not forwarded + $forceexit=0; # if the proxy should exit with error if ($line = <$curloutrd>) { $line =~ s/\r?\n$//; + # The first line is the status. if ($line =~ /^HTTP\/[0-9]+\.[0-9]+ ([0-9]+ .*)$/) { $status =$1; @resphead=('Status: '.$status); + # If the status is not 2XX and this URL was already archived before it + # should not be overwritten. + # Otherwise it can happen that, for example, after refreshing the content + # previously downloaded will be overwritten with a "304 Not Modified" and + # lost. + if(($status !~ /^2[0-9][0-9]/) and (-e $respcontpath)){ $archive=0; } + # now read the header lines while ($line = <$curloutrd>) { $line =~ s/[\r\n]$//g; + # empty line (double \n) meane the end of the header. if($line eq ''){ last; } + # If line starts with space or tab, it's a continuation of the previous + # line. if($line =~ /^[ \t](.*)$/){ + # Continuation of a skipped line should be skipped too. if ($lastskipped) { next; } @@ -308,6 +375,9 @@ sub proxy { next; } + # If one of these header fields is present it can not be assumed that + # the received value of Content-length is also correct for the forwarded + # content. if($headname eq 'transfer-encoding') { $safelength = 0; } @@ -322,6 +392,8 @@ sub proxy { $headvalue =~ s/[\r\n \t]*$//; $contentlength = int($headvalue); } + + # Some header fields should not be forwarded. if($headname =~ RESPONSE_HEADER_BLOCK) { $lastskipped =1; next; @@ -331,12 +403,16 @@ sub proxy { push @resphead, $line; } - if ($lengthdefined and ($lengthsafe or ($contentlength==0))) { + # Only add the Content-length header if it's safe to do so. + if ($lengthdefined and ($safelength or ($contentlength==0))) { push @resphead, 'Content-length: '.$contentlength; } + # After forwarding the response header it's too late to send any error + # messages. $safeerror=0; + # Forward the header and safe it to file if possible and allowed. if ($archive) { $respheadopen = open($respheadfile, ">", $respheadpath); } @@ -352,15 +428,16 @@ sub proxy { close $respheadfile; } + # Now forward the actual content and safe it to file if possible and + # allowed. if ( ! binmode ($curloutrd)) { - # print "FAIL 1\n"; $forceexit=1; } elsif ( ! binmode (STDOUT) ) { - # print "FAIL 2\n"; $forceexit=1; } else { + # Create the file only if there was content received. No 0-byte files. $firstline=1; while (read ($curloutrd,$buffer,1024)) { unless (print (STDOUT $buffer)) { @@ -392,9 +469,11 @@ sub proxy { } } + # Ok, done. Now wait for curl to exit. (Should have already happened). waitpid($curlpid,0); if ($? != 0) { + # Generate error message if possible. if ($safeerror) { $errormsg=''; while ($line = <$curlerrrd>) { @@ -416,6 +495,7 @@ sub proxy { # } # } + # Send error message if it;s safe to do so. if($safeerror){ return fail("Status: 500 Internal Server Error\n","500 Internal Server Error",$errormsg); } @@ -425,31 +505,44 @@ sub proxy { } -sub access { #kind of doubles the functionality of access.pl but for http +# This function checks if the user has unlocked the proxy. Nonzero means yes. +sub access { if($ENV{'HTTP_HOST'} =~ UNLOCK_PROXY_HOST){ return 1; } + + # Check if IP if ($ENV{'REMOTE_ADDR'} =~ /^([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)$/) { $IP = $1; + + # 127.0.0.1 is always allowed. All https requests are tunnelled through + # 127.0.0.1. if ($IP eq '127.0.0.1') { return 1; } + + # If access file doesn't exist the proxy is not unlocked for this IP. $accesspath=ACCESS_PATH.$IP; if (! (-e $accesspath)) { return 0; } elsif (open ($accessfile,"+<",$accesspath)) { + # The access file has to be locked otherwise one process may read it while + # another one updates it. unless (flock ($accessfile, 2)) { close ($accessfile); return 0; } + # First line is the timestamp of proxy unlocking. Second line is the + # timestamp of last access. $unlocktime=<$accessfile>; $lasttime=<$accessfile>; $unlocktime =~ s/[\r\n]//g; $lasttime =~ s/[\r\n]//g; + # The timestamps must be numbers. if ($unlocktime =~ /^([0-9]+)$/) { $unlocktime=int($1); } @@ -467,11 +560,14 @@ sub access { #kind of doubles the functionality of access.pl but for http } if ((abs($accesstime-$unlocktime)>$timeout_unlock) or (abs($accesstime-$lasttime)>$timeout_inact)){ + # If too much time passed proxy is not unlocked any more. Also the + # access file is now useless and will be removed. close ($accessfile); unlink $accesspath; return 0; } else { + # the proxy is unlocked. The access file will now be updated. if (seek($accessfile, 0, 0)) { print $accessfile "$unlocktime\n$accesstime\n"; truncate ($accessfile , tell($accessfile)); @@ -489,15 +585,21 @@ sub access { #kind of doubles the functionality of access.pl but for http } } +# Function to unlock the proxy. sub unlock { if ($ENV{'REMOTE_ADDR'} =~ /^([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)$/) { $IP=$1; } + # If the request method is GET the CGI parameters must be taken from the + # query string (part of URL). if ($ENV{'REQUEST_METHOD'} eq 'GET') { %CGI=getcgi($ENV{'QUERY_STRING'}); } + # If request method is POST the CGI parameters must be taken from standard + # input. elsif ($ENV{'REQUEST_METHOD'} eq 'POST'){ + #Check content-type. Multipart is not supported. if ($ENV{'CONTENT_TYPE'} eq 'application/x-www-form-urlencoded'){ %CGI=getcgi( ); } @@ -509,12 +611,13 @@ sub unlock { return unlockpage("Unsupported method: $ENV{'REQUEST_METHOD'}.","Status: 405 Method Not Allowed\nAllow: GET, POST\n"); } + # If there was an IP in the form it must be used instead. if ($CGI{'ip'} =~ /^([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)$/) { $IP=$1; } if ($IP eq '') { - return unlockpage("$Invalid IP.","Status: 403 Forbidden\n"); + return unlockpage("Invalid IP.","Status: 403 Forbidden\n"); } if ($CGI{'username'} eq ''){ @@ -530,21 +633,25 @@ sub unlock { } $passpath = PASS_PATH.$CGI{'username'}; - + # Open file to read user's password. open($passfile, "<", $passpath) or return unlockpage('Wrong username or password.',"Status: 403 Forbidden\n"); $pass = <$passfile>; close($passfile); $pass =~ s/[\r\n]//g; + # The password is stored in URL-encoded form. $pass = urldecode($pass); if ($pass ne $CGI{'password'}){ return unlockpage('Wrong username or password.',"Status: 403 Forbidden\n"); } + # All unlocking access must be logged: time, IP, user. The server owner must + # be able to know who did what with the proxy (in case of legal problems). open ($logfile, ">>", UNLOCK_LOG) or return unlockpage("Couldn't log your action.","Status: 500 Internal Server Error\n"); print $logfile strftime("%d.%m.%Y %H:%M:%S", gmtime($accesstime))." $IP $CGI{'username'}\n"; close($logfile); + # Ok, unlocked. Create access file. $accesspath=ACCESS_PATH.$IP; open ($accessfile,">",$accesspath) or return unlockpage("$accesspath","Status: 403 Forbidden\n"); print $accessfile "$accesstime\n$accesstime\n"; @@ -554,6 +661,8 @@ sub unlock { } +# The function to get CGI parameters from string. +# Format is: name=url_encoded_value&name=url_encoded_value& ... &name=url_encoded_value sub getcgi { my $arg; my $varl; @@ -568,6 +677,7 @@ sub getcgi { return %cgi; } +# Function for decoding URL-encoded text sub urldecode { my $t = $_[0]; $t =~ s/\+/ /g; @@ -575,6 +685,11 @@ sub urldecode { return $t; } +# Function for formatting header field names. +# The Environment variable name is like this: HTTP_HEADER_NAME or HEADER_NAME. +# But should be Header-Name. +# Actually the names are case-insensitive but in practise they often look like +# this. (Otherwise suspicious?) sub formatheader { my $t = $_[0]; $t =~ s/^HTTP_//; @@ -584,6 +699,9 @@ sub formatheader { return $t; } +# Function for showing the proxy unlock form page. +# arguments: 1 - additional message (optional), 2 - additional header fields +# (optional) sub unlockpage { (my $message, my $header)=@_; if($header ne ''){ @@ -616,6 +734,8 @@ sub unlockpage { print "\n"; } +# Function for showing error messages. +# Arguments: 1 - header fields, 2 - page title, 3 - error message text. sub fail { (my $header, my $title, my $message)=@_; if($header ne ''){ @@ -638,6 +758,7 @@ sub fail { print "\n"; } +# Function to show the message after unlocking the proxy. sub unlockedpage { print "Content-type: text/html\n\n"; print ''; @@ -654,6 +775,7 @@ sub unlockedpage { print "\n"; } +# Function to show the message when proxy not unlocked. sub noaccess { print "Status: 403 Forbidden\n;"; print "Content-type: text/html\n\n"; @@ -669,6 +791,7 @@ sub noaccess { print "\n"; } +# Function to show debug information. Not used any more. sub debag { print "Content-type: text/plain\n\n"; foreach $envk (keys %ENV) { @@ -690,6 +813,8 @@ sub debag { print 'URL: ',$URL,"\n"; } +# Function to check URL and divide in parts: protocol, hostname, port number, +# path, query string. sub validateurl { my $url = $_[0]; $prot; @@ -761,12 +886,15 @@ sub validateurl { return 1; } +# Function to convert URL to archive path. Also creates required directories. +# Returns the path on success and empty string on fail. +# Inspired by the MIRA browser. sub makepath { - #inspired by the MIRA browser! - (my $prot, my $host, my $port, my $path) = @_; + (my $prot, my $host, my $port, my $path, my $query) = @_; my $archpath = ARCH_PATH; + # First subdir: protocol @p port_number if ($prot =~ /^(https?)$/) { $archpath .= $1; } @@ -785,6 +913,18 @@ sub makepath { } } + # Host name: each segment between dots is a subdir, revedrsed order, last part + # ends with @n. + # For example www.example.com becomes + # ... /com/example/www@/ ... + + # When segment is longer than 120 characters it's divided into 64 character + # parts ending with '-' until the last one is not longer than 120. + # For example + # www.aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmmmmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz.com + # becomes + # ... /com/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggg-/gggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmm-/mmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz/www@n/ ... + if($host =~ /^([A-Za-z0-9\-\.]+)$/) { $host = $1; } @@ -832,13 +972,30 @@ sub makepath { } } + # Path: each segment between '/' is subdir. The segment after the last '/' is + # not a subdir - it is part of the file name. And it will be appended with + # some characters outside of this function: @g, @h, @u or @v. + # Exception: when there is a query string the last segment is a subdir too and + # ends with q. + # Characters that are not letters or numbers or '_' or '.' are encoded. Like + # in URL-encoding but with '@' instead of '%'. + # When segment is longer than 120 characters it's divided into 64 character + # parts ending with '-' until the last one is not longer than 120. + + # For example: + # /some/path/?a=1 becomes ... /some/path@q/ ... + # /some/other-path becomes ... /some/other@2Dpath + # /path/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmmmmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz/yyy + # becomes + # ... /path/aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggg-/gggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmm-/mmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz/yyy + $path =~ s/^\///; while ((my $ind = index ($path, '/'))>=0) { my $part = substr $path, 0, $ind; $path= substr $path, $ind+1; - $part =~ s/[^A-Za-z0-9_\.]/sprintf ("@%02X",ord($1))/eg; + $part =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; while (length ($part) > 120) { $archpath .= '/'.substr($part,0,64).'-'; @@ -858,7 +1015,7 @@ sub makepath { } } } - $path =~ s/[^A-Za-z0-9_\.]/sprintf ("@%02X",ord($1))/eg; + $path =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; while (length ($path) > 120) { $archpath .= '/'.substr($path,0,64).'-'; $path = substr($path,64); @@ -869,6 +1026,46 @@ sub makepath { } } } + + if ($query) { + # query string: The last part is not a subdir - it is part of the file name. + # And it will be appended with some characters outside of this function: @g, + # @h, @u or @v. + # When Query string is longer than 240 characters it's divided into 128 + # character parts ending with '-' until the last one is not longer than 240. + # Characters that are not letters or numbers or '_' or '.' are encoded. Like + # in URL-encoding but with '@' instead of '%'. + # When segment is longer than 120 characters it's divided into 64 character + # parts ending with '-' until the last one is not longer than 120. + # The '?' at the beginning is not part of the query string. + # For example: + # ?a=1&b=%25%5E%26 becomes ... /a@3D1@26b@3D@2525@255E@2526 + # ?a=aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmmmmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz&b=aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmmmmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz + # becomes + # ... /a@3Daaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmm-/mmmmmmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz@26@b@3Daaaaaaaaaabbbb-/bbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffgggggggggghhhhhhhhhhiiiiiiiiiijjjjjjjjjjkkkkkkkkkkllllllllllmmmmmmmmmmnnnnnnnnnnoooooooooopppppppppprrrrrrrrrrssssssssssttttttttttuuuuuuuuuuwwwwwwwwwwyyyyyyyyyyzzzzzzzzzz + $query =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; + + $archpath .= '/'.$path.'@q'; + + $path = $query; + unless (-d $archpath) + { + unless (mkdir $archpath) { + return ''; + } + } + + while (length ($path) > 240) { + $archpath .= '/'.substr($path,0,128).'-'; + $path = substr($path,128); + unless (-d $archpath) + { + unless (mkdir $archpath) { + return ''; + } + } + } + } $archpath .= '/'.$path; return $archpath; } diff --git a/rewrite.1.pl b/rewrite.1.pl index 1578162..1ffe9ed 100755 --- a/rewrite.1.pl +++ b/rewrite.1.pl @@ -1,5 +1,13 @@ ###PERL; +# rewrite is generated from rewrite.1.pl +# +# This is the Squid helper program used for redirection. It always redirects to +# the https proxy. +# +# see also: +# http://wiki.squid-cache.org/Features/AddonHelpers#URL_Re-Writing_.28Mangling.29 + ###REWRITE_URL; $|=1; diff --git a/settings b/settings index 09502af..fb844a8 100644 --- a/settings +++ b/settings @@ -20,10 +20,11 @@ server_admin = bicyclesonthemoon@chirpingmustard.info # No matter what key you use there will be ALWAYS an unavoidable certifficate # mismatch warning. Because the proxy does an equivalent to a MITM attack. -ssl_key = /etc/apache2/ssl/botm.key -ssl_cert = /etc/apache2/ssl/botm.crt +ssl_key = /etc/apache2/ssl/proxy.key +ssl_cert = /etc/apache2/ssl/proxy.crt -#doesn't have to be a real domain +# The domain and path used for proxy unlocking +# doesn't have to be a real domain unlock_domain = yplom.bicyclesonthemoon.info unlock_path = /proxy/unlock unlock_domain_regex = ^yplom\.bicyclesonthemoon\.info(:[0-9]*)?$ @@ -37,6 +38,9 @@ block_host_regex = ^(localhost|(botcastle[0-9]*))$ timeout_unlock = 90 timeout_inact = 15 +#Time in seconds +timeout_arch = 172800 # how old files must be to safely remove them + path = /usr/local/bin:/usr/bin:/bin perl = /usr/bin/perl curl = /usr/bin/curl @@ -45,6 +49,14 @@ cp = /bin/cp mv = /bin/mv rm = /bin/rm gcc = /usr/bin/gcc +gzip = /bin/gzip c_flags = -g -Wall -rm_access_crontab = 0 0 * * * #How often to remove leftover unlock info. +log_size_limit = 65536 # How big can a log file be +logs_uncompressed = 2 # How many uncompressed old logs to keep +logs_total = 10 # How many old logs to keep + +rm_access_crontab = 0 1 * * * # How often to remove leftover unlock info. +cleararch_crontab = 30 0 * * * # How often to clear the archive from old files. +oldlogs_crontab = 0 0 * * * # How often to deal with old logs +# Not a good idea to launch oldlogs just after cleararch, I think.