From 66594db75b45dc0800fe7ef82839d8efaa4cf5ab Mon Sep 17 00:00:00 2001
From: b <b@7dec801f-c475-4e67-ba99-809552d69c55>
Date: Wed, 16 Dec 2015 20:54:14 +0000
Subject: [PATCH] readheaderfile() moved to proxy library. can process all 3
 types of facebook pages now.

git-svn-id: svn://botcastle1b/yplom/facebug1@8 7dec801f-c475-4e67-ba99-809552d69c55
---
 bot.1.pl | 262 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 155 insertions(+), 107 deletions(-)

diff --git a/bot.1.pl b/bot.1.pl
index 82f50ae..de81ef3 100644
--- a/bot.1.pl
+++ b/bot.1.pl
@@ -4,7 +4,7 @@ use strict;
 use Fcntl;
 use File::Copy;
 ###PROXY_LIB;
-use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode urldecode);
+use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode urldecode readheaderfile);
 use POSIX qw(strftime);
 
 ###ARCH_PATH;
@@ -64,18 +64,24 @@ sub processgroup {
 		return;
 	}
 	
-	$archpath = url2path('https://m.facebook.com/groups/'.$groupid).'@q/';
-	
 	print "Group $groupid\n";
 	
-	processdir($archpath,$groupid,\%settings,\%names);
+	$archpath = url2path('https://m.facebook.com/groups/'.$groupid);
+	processdir($archpath.'@q/',\%settings,\%names,0);
+	processdir($archpath.'/@q/',\%settings,\%names,0);
+	
+	$archpath = url2path('https://m.facebook.com/comment/replies');
+	processdir($archpath.'@q/',\%settings,\%names,1);
+	processdir($archpath.'/@q/',\%settings,\%names,1);
+	
 }
 
 sub processdir {
-	(my $dirpath, my $groupid, my $settings, my $names) = @_;
+	(my $dirpath, my $settings, my $names, my $pagemode) = @_;
 	my $dir;
 	my $subpath;
 	my $subpathfull;
+	print "Dir $dirpath\n";
 	
 	unless ( opendir ($dir, $dirpath)) {
 		return;
@@ -86,17 +92,17 @@ sub processdir {
 			next;
 		}
 		if (-f $subpathfull) {
-			processfile($subpathfull, $groupid, $settings, $names);
+			processfile($subpathfull, $settings, $names, $pagemode);
 		}
 		elsif (-d $subpathfull) {
-			processdir($subpathfull.'/', $groupid, $settings, $names);
+			processdir($subpathfull.'/', $settings, $names, $pagemode);
 		}
 	}
 	closedir ($dir);	
 }
 
 sub processfile {
-	(my $headerpath, my $groupid, my $settings, my $names) = @_;
+	(my $headerpath, my $settings, my $names, my $pagemode) = @_;
 	my $basepath;
 	my $contentpath;
 	my $contentfile;
@@ -111,7 +117,9 @@ sub processfile {
 	
 	my %cgi;
 	
-	my $id;
+	my $postid;
+	my $threadid;
+	my $groupid=0; ###!
 	my $timenumber;
 	
 	my %thread;
@@ -137,18 +145,38 @@ sub processfile {
 	### REDESIGN THE CONDITIONS!
 	if($query ne '') {
 		%cgi=getcgi($query);
-		$id = $cgi{'id'};
-		if ($id =~ /^[0-9]+$/) {
-			$pagetype='thread';
+		if($pagemode) {
+			$postid = $cgi{'ctoken'};
+			if ($postid =~ /^([0-9]+)_([0-9]+)$/) {
+				$threadid = $1;
+				$postid = $2;
+				$pagetype = 'post';
+			}
+			else {
+				return;
+			}
 		}
-		else {
-			$pagetype = 'group';
+		else{
+			$threadid = $cgi{'id'};
+			if ($threadid =~ /^([0-9]+)$/) {
+				$threadid = $1;
+				$pagetype='thread';
+			}
+			else {
+				$pagetype = 'group';
+			}
 		}
 	}
 	else {
-		$id='';
-		$pagetype = 'group';
+		if($pagemode) {
+			return;
+		}
+		else {
+			$threadid='';
+			$pagetype = 'group';
+		}
 	}
+	print " type=$pagetype\n";
 	
 	for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
 		%header = readheaderfile($headerpath); 
@@ -183,7 +211,7 @@ sub processfile {
 		}
 	}
 	
-	# REDESIGN THE CONDITIONS!
+	# This condition is redundant now.
 	if ($pagetype) {
 		
 		
@@ -203,18 +231,25 @@ sub processfile {
 		my $hidename;
 		my $attnumber;
 		my $incomplete;
+		my $firstpost;
 		
 		if ($pagetype eq 'thread') {
-			print "Thread $id\n";
+			print "Thread $threadid\n";
 			
-			$thread{'id'}=$id;
-			$thread{'groupid'}=$groupid;
+			$thread{'id'}=$threadid;
+			$thread{'groupid'}=$$settings{'id'};
 			$thread{'timenumber'}=$timenumber;
 			$mode = 'thread';
 			$level=0;
 			$attnumber=0;
 			$incomplete=0;
 		}
+		elsif ($pagetype eq 'post'){
+			print "Post $postid ($threadid)\n"; 
+			
+			$mode='posts';
+			$firstpost=1;
+		}
 		else { #group
 			print "Threads\n";
 			$mode = 'threads';
@@ -227,11 +262,11 @@ sub processfile {
 			close($contentfile);
 			return;
 		}
-		while () {
+		while ($mode ne '') {
 			if ($closetag){
 				$tag{'<'} = '/'.$tag{'<'};
 				$tag{'/'}='/';
-				$tag{"\\"}=undef;
+				delete $tag{"\\"};
 				$closetag=0;
 			}
 			else {
@@ -254,7 +289,7 @@ sub processfile {
 					print "Thread [$1]\n";
 					$mode = 'thread';
 					%thread = ();
-					$thread{'groupid'}=$groupid;
+					$thread{'groupid'}=$$settings{'id'};
 					$thread{'timenumber2'}=$timenumber;
 					$level = 0;
 					$attnumber=0;
@@ -577,14 +612,39 @@ sub processfile {
 					%post = ();
 					
 					$post{'id'} = $1;
-					$post{'threadid'} = $id;
+					$post{'threadid'} = $threadid;
 					$post{'groupid'} = $$settings{'id'};
 					$post{'timenumber'} = $timenumber;
 					
 					$mode = 'post';
 					$level=0;
 					$attnumber=0;
-					print "Post $post{'id'}\n";
+					
+					if($pagetype eq 'post') {
+						if(!$groupid) {
+							print "Can't determine if post belongs to group $$settings{'id'}.\n";
+							$mode='';
+							last;
+						}
+						elsif($post{'id'} eq $postid) {
+							$firstpost=1;
+						}
+						else {
+							$firstpost=0;
+							$post{'postid'} = $postid;
+						}
+					}
+					print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n";
+				}
+				elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) {
+					if ($1 eq $$settings{'id'}) {
+						$groupid = 1;
+					}
+					else {
+						print "Post does not belong to group $$settings{'id'}.\n";
+						$mode = '';
+						last;
+					}
 				}
 			}
 			
@@ -621,18 +681,32 @@ sub processfile {
 								print "Can't mkdir $postpath.\n";
 							}
 						}
-						$postpath.='post/';
+						if(($pagetype eq 'post')and !$firstpost){ # $pagetype holds the string; $pagemode is only a 0/1 flag
+							$postpath.='postreply/'; # post page entry other than the ctoken post itself
+						}
+						else {
+							$postpath.='post/';
+						}
 						unless (-d $postpath) {
 							unless (mkdir $postpath) {
 								print "Can't mkdir $postpath.\n";
 							}
 						}
-						$postpath.=$thread{'id'}.'/';
+						$postpath.=$post{'threadid'}.'/'; # thread id stored on the post record
 						unless (-d $postpath) {
 							unless (mkdir $postpath) {
 								print "Can't mkdir $postpath.\n";
 							}
 						}
+						if(($pagetype eq 'post')and !$firstpost){ # same test as above: replies get one more directory level
+							$postpath.=$post{'postid'}.'/'; # id of the post this entry was found under
+							unless (-d $postpath) {
+								unless (mkdir $postpath) {
+									print "Can't mkdir $postpath.\n";
+								}
+							}
+						}
+						
 						$postpath.=$post{'id'};
 						
 						if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
@@ -745,6 +819,9 @@ sub processfile {
 						$mode = 'post';
 					}
 				}
+				elsif ($tag{'<'} eq 'br') {
+					$post{'content'}.='<br>';
+				}
 				elsif ($tag{'<'} eq 'p') {
 					$post{'content'}.='<p>';
 				}
@@ -875,112 +952,83 @@ sub processfile {
 	}
 }
 
-sub readheaderfile {
-	(my $headerpath) = @_;
-	my $headerfile;
-	my %header;
-	
-	if(ref($headerpath)) {
-		$headerfile=$headerpath;
-	}
-	else {
-		unless (open ($headerfile, "<", $headerpath)) {
-			return %header;
-		}
-	}
-	
-	my $lastname='';
-	
-	while (defined(my $line = <$headerfile>)) {
-		$line =~ s/[\r\n]$//g;
-		# print">> $line <<\n";
-		my $headname='';
-		my $headval='';
-		
-		if($line =~ /^[ \t]+([^ \t](.*[^ \t])?)[ \t]*$/){
-			if($lastname ne '') {
-				$header{$lastname}.=$1;
-			}
-		}
-		elsif ($line =~ /^([^:]+):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
-			$headname = lc($1);
-			$headval = $2;
-			
-			if ($header{$headname} ne '') {
-				$header{$headname}.=', '.$headval;
-			}
-			else {
-				$header{$headname}=$headval;
-			}
-			$lastname = $headname;
-		}
-	}
-	
-	unless (ref($headerpath)) {
-		close ($headerfile);
-	}
-	
-	return %header;
-}
-
-# Very similar to header file reading.
+# Function to read data from datafiles.
+# Very similar to http header file reading. (function readheaderfile() in proxy
+# library)
+# 
 # Differences:
 #
-# After field name and colon there must be exactly one whitespace (space or
+# 1. After field name and colon there must be exactly one whitespace (space or
 # tab). Any other leading or trailing whitespace (but not the newline character
 # at the end of the line) is treated as part of the field value.
 #
-# When header field is split into multiple lines the next lines must start with
-# exactly one whitespace (tab or space) Any other leading or trailing whitespace
-# (but not the newline character at the end of the line) is treated as part of
-# the field value. the lines will be joined with a newline between them.
+# 2. When header field is split into multiple lines the next lines must start
+# with exactly one whitespace (tab or space). Any other leading or trailing
+# whitespace (but not the newline character at the end of the line) is treated
+# as part of the field value. The lines will be joined with a newline between
+# them.
+#
+# 3. When the same field name appears again its value replaces the previous one.
+# 
+# 4. Line separator is LF and not CR LF. The CR character is treated as part of
+# the field value.
 #
-# When the same field name appears it replaces the previous one.
+# Returns a hash containing the values.
+# Names are case insensitive; they are converted to lowercase.
+#
+# Argument can be a path or a file handle. A file handle is read as it is and
+# left open. A path is opened before reading and closed afterwards. If the
+# file cannot be opened, an empty hash is returned.
+# 
 sub readdatafile {
-	(my $headerpath) = @_;
-	my $headerfile;
-	my %header;
+	(my $datapath) = @_;
+	my $datafile;
+	my %data;
 	
-	if(ref($headerpath)) {
-		$headerfile=$headerpath;
+	# check if $datapath is actually a path or maybe a filehandle
+	# filehandles are references.
+	if(ref($datapath)) {
+		$datafile=$datapath;
 	}
 	else {
-		unless (open ($headerfile, "<", $headerpath)) {
-			return %header;
+		unless (open ($datafile, "<", $datapath)) {
+			return %data;
 		}
 	}
-	
+
+	# The name of header field in previous line. Required for header fields that
+	# occupy multiple lines.
 	my $lastname='';
 	
-	while (defined(my $line = <$headerfile>)) {
-		$line =~ s/[\r\n]$//g;
-		my $headname='';
-		my $headval='';
+	while (defined(my $line = <$datafile>)) {
+		$line =~ s/[\n]$//g;
+		my $name='';
+		my $value='';
 		
+		# Line starts with whitespace. It's a continuation of the previous line.
+		# Concatenate the field value, separated by newline.
 		if($line =~ /^[ \t](.*)$/){
 			if($lastname ne '') {
-				$header{$lastname}.="\n".$1;
+				$data{$lastname}.="\n".$1;
 			}
 		}
+		# Line starts with a name followed by colon. Save the value
 		elsif ($line =~ /^([^:]+):[ \t](.*)$/) {
-			$headname = lc($1);
-			$headval = $2;
+			$name = lc($1);
+			$value = $2;
 			
-			# if ($header{$headname} ne '') {
-				# $header{$headname}.=$headval;
-			# }
-			# else {
-				$header{$headname}=$headval;
-			# }
-			$lastname = $headname;
+			$data{$name}=$value;
+			
+			$lastname = $name;
 		}
 	}
 	
-	unless (ref($headerpath)) {
-		close ($headerfile);
+	# If argument was a path the file must be closed. 
+	unless (ref($datapath)) {
+		close ($datafile);
 	}
 	
-	return %header;
+	return %data;
 }
 
 sub writedatafile {
-- 
2.30.2