From: b <b@7dec801f-c475-4e67-ba99-809552d69c55>
Date: Wed, 9 Dec 2015 11:47:32 +0000 (+0000)
Subject: Saving nextposts and images.
X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=79ab006c62c1591a923c2e08960b53e1df575d98;p=yplom%2Ffacebug1

Saving nextposts and images.



git-svn-id: svn://botcastle1b/yplom/facebug1@6 7dec801f-c475-4e67-ba99-809552d69c55
---

diff --git a/bot.1.pl b/bot.1.pl
index eb9f75c..61b3a77 100644
--- a/bot.1.pl
+++ b/bot.1.pl
@@ -117,6 +117,9 @@ sub processfile {
 	my %thread;
 	my %thread2;
 	
+	my %post;
+	my %post2;
+	
 	if ($headerpath =~ /^((.+)\@h)$/) {
 		$headerpath = $1;
 		$basepath = $2;
@@ -190,6 +193,7 @@ sub processfile {
 		my %tag;
 		my $mode = 'thread';
 		my $level = 0;
+		my $level2;
 		my $closetag=0;
 		my $ignoretext;
 		my $link;
@@ -229,6 +233,7 @@ sub processfile {
 					$level=0;
 					$ignoretext=1;
 					$hidename=0;
+					$link=0;
 				}
 				elsif ($tag{'<'} eq 'abbr') {
 					$mode = 'thread-time';
@@ -243,18 +248,34 @@ sub processfile {
 					$mode='posts';
 					
 					my $threadfile;
-					my $threadpath = ARCH_PATH.$$settings{'id'}.'/'.'thread/'.$thread{'id'};
+					my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
+					unless (-d $threadpath) {
+						unless (mkdir $threadpath) {
+							print "Can't mkdir $threadpath.\n";
+						}
+					}
+					$threadpath.='thread/';
+					unless (-d $threadpath) {
+						unless (mkdir $threadpath) {
+							print "Can't mkdir $threadpath.\n";
+						}
+					}
+					$threadpath.=$thread{'id'};
 					
 					# if (open ($threadfile, "+<", $threadpath)) {
 					if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) {
 						if (flock ($threadfile, 2)) {
-							# In the future it might be necessary to use a different function.
-							%thread2 = readheaderfile($threadfile);
+							%thread2 = readdatafile($threadfile);
 							
 							if (($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'})) {
 								print ("Newer version already saved.\n\n");
 							}
 							else {
+								foreach my $ind (keys %thread2) {
+									if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
+										delete $thread2{$ind};
+									}
+								}
 								foreach my $ind (keys %thread) {
 									$thread2{$ind}=$thread{$ind};
 								}
@@ -263,16 +284,16 @@ sub processfile {
 								}
 								
 								if (seek($threadfile, 0, 0)) {
+									writedatafile($threadfile,%thread2);
+									truncate ($threadfile , tell($threadfile));
+									
 									foreach my $ind (keys %thread2) {
-										print $threadfile "$ind: $thread2{$ind}\n";
 										print "$ind: $thread2{$ind}\n"; ####
 									}
-									print $threadfile "\n";
-									truncate ($threadfile , tell($threadfile));
 									print "saved.\n\n";
 								}
 								else {
-									print "Failed seek $threadfile.\n\n";
+									print "Can't seek $threadfile.\n\n";
 								}
 							}
 						}
@@ -287,6 +308,7 @@ sub processfile {
 					}
 				}
 			}
+			
 			elsif ($mode eq 'thread-author') {
 				if ($tag{'<'} eq 'a') {
 					if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
@@ -301,6 +323,7 @@ sub processfile {
 					$mode='thread';
 				}
 			}
+			
 			elsif ($mode eq 'thread-content') {
 				if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
 					++$level;
@@ -309,7 +332,7 @@ sub processfile {
 				elsif ($tag{'<'} eq '/div') {
 					if($level){
 						--$level;
-						$thread{'postcontent'}.='<div>';
+						$thread{'postcontent'}.='</div>';
 					}
 					else {
 						$mode = 'thread';
@@ -348,6 +371,7 @@ sub processfile {
 				}
 				
 			}
+			
 			elsif ($mode eq 'thread-time') {
 				if ($tag{'<'} eq '/abbr') {
 					$mode = 'thread';
@@ -369,7 +393,7 @@ sub processfile {
 				elsif ($tag{'<'} eq 'a') {
 					if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
 						++$attnumber;
-						$thread{'img-'.$attnumber}=$2;
+						$thread{'img-'.$attnumber}='a_'.$2;
 						$mode = 'thread-attachment-img';
 					}
 					elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
@@ -387,7 +411,7 @@ sub processfile {
 						$thread{'imgkey-'.$attnumber}=$imgkey;
 					}
 					else {
-						$thread{'img-'.$attnumber}=undef;
+						delete $thread{'img-'.$attnumber};
 						--$attnumber;
 					}
 				}
@@ -402,7 +426,7 @@ sub processfile {
 				}
 				elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) {
 					my $imgurl = urldecode($3);
-					my $imgid='';
+					my $imgid='i_';
 					$imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
 					
 					while(length($imgurl)>240) {
@@ -426,12 +450,244 @@ sub processfile {
 					$mode = 'thread-attachment-link';
 				}
 			}
+			
+			
+			elsif ($mode eq 'posts') {
+				if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
+					%post = ();
+					
+					$post{'id'} = $1;
+					$post{'threadid'} = $id;
+					$post{'groupid'} = $$settings{'id'};
+					$post{'timenumber'} = $timenumber;
+					
+					$mode = 'post';
+					$level=0;
+					$attnumber=0;
+					print "Post $post{'id'}\n";
+				}
+			}
+			
+			elsif ($mode eq 'post') {
+				if ($tag {'<'} eq 'h3') {
+					$mode = 'post-author';
+				}
+				elsif ($tag{'<'} eq 'abbr') {
+					$mode = 'post-time';
+				}
+				if ($tag{'<'} eq 'div') {
+					if(($tag{'class'} eq '')and($post{'content'} eq '')) {
+						$mode = 'post-content';
+						$level2=0;
+						$ignoretext=0;
+						$hidename=0;
+						$link=0;
+					}
+					else {
+						++$level;
+					}
+				}
+				elsif ($tag{'<'} eq '/div') {
+					if ($level) {
+						--$level;
+					}
+					else {
+						$mode = 'posts';
+						
+						my $postfile;
+						my $postpath = ARCH_PATH.$$settings{'id'}.'/';
+						unless (-d $postpath) {
+							unless (mkdir $postpath) {
+								print "Can't mkdir $postpath.\n";
+							}
+						}
+						$postpath.='post/';
+						unless (-d $postpath) {
+							unless (mkdir $postpath) {
+								print "Can't mkdir $postpath.\n";
+							}
+						}
+						$postpath.=$thread{'id'}.'/';
+						unless (-d $postpath) {
+							unless (mkdir $postpath) {
+								print "Can't mkdir $postpath.\n";
+							}
+						}
+						$postpath.=$post{'id'};
+						
+						if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
+							if (flock ($postfile, 2)) {
+								%post2 = readdatafile($postfile);
+								
+								if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) {
+									print ("Newer version already saved.\n\n");
+								}
+								else {
+									foreach my $ind (keys %post2) {
+										if($ind =~ /^img(key)?-[0-9]+$/) {
+											delete $post2{$ind};
+										}
+									}
+									foreach my $ind (keys %post) {
+										$post2{$ind}=$post{$ind};
+									}
+									if ($post2{'key'} eq '') {
+										$post2{'key'} = key(KEY_BITS);
+									}
+									if (seek($postfile, 0, 0)) {
+										writedatafile($postfile,%post2);
+										truncate ($postfile , tell($postfile));
+								
+										foreach my $ind (keys %post2) {
+											print "$ind: $post2{$ind}\n";
+										}
+										print "saved.\n\n";
+									}
+									else {
+										print "Can't seek $postfile.\n\n";
+									}
+								}
+							}
+							else {
+								print "Can't lock $postfile.\n\n";
+							}
+							close ($postfile);
+						}
+						else
+						{
+							print "Can't open $postpath.\n\n";
+						}
+					}
+				}
+				
+				elsif ($tag{'<'} eq 'a') {
+					if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+						++$attnumber;
+						$post{'img-'.$attnumber}='a_'.$2;
+						$mode = 'post-img';
+					}
+					elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) {
+						$mode = 'post-replies';
+					}
+				}
+			}
+			
+			elsif ($mode eq 'post-img') {
+				if ($tag{'<'} eq 'img') {
+					my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'});
+					if ($imgkey ne '') {
+						$post{'imgkey-'.$attnumber}=$imgkey;
+					}
+					else {
+						delete $post{'img-'.$attnumber};
+						--$attnumber;
+					}
+				}
+				elsif ($tag{'<'} eq '/a') {
+					$mode = 'post';
+				}
+			}
+			
+			
+			elsif ($mode eq 'post-author') {
+				if ($tag{'<'} eq 'a') {
+					if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+						my $author = $1;
+						if ($post{'author'} eq '') {
+							$post{'author'} = $author;
+							$post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
+						}
+					}
+				}
+				elsif ($tag{'<'} eq '/h3') {
+					$mode='post';
+				}
+			}
+			
+			elsif ($mode eq 'post-content') {
+				if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
+					++$level2;
+					$post{'content'}.='<div>';
+				}
+				elsif ($tag{'<'} eq '/div') {
+					if($level2){
+						--$level2;
+						$post{'content'}.='</div>';
+					}
+					else {
+						$mode = 'post';
+					}
+				}
+				elsif ($tag{'<'} eq 'p') {
+					$post{'content'}.='<p>';
+				}
+				elsif ($tag{'<'} eq '/p') {
+					$post{'content'}.='</p>';
+				}
+				elsif (!$ignoretext) {
+					if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
+						my $imgurl = urldecode($2);
+						my $imgid = 'r_';
+						$imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
+						
+						while(length($imgurl)>240) {
+							$imgid.=substr($imgurl,0,120).'-/';
+							$imgurl=substr($imgurl,120);
+						}
+						$imgid.=$imgurl;
+					
+						my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
+						if ($imgkey ne '') {
+							$post{'content'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
+						}
+						else {
+							$post{'content'}.='<img src="" alt="'.$tag{'alt'}.'">';
+						}
+						
+					}
+					elsif ($tag{'<'} eq 'a') {
+						if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+							$post{'content'}.='<a href="'.urldecode($3).'">';
+							$link=1;
+						}
+						elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
+							$post{'content'}.='<a href="#">'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'});
+							$link=1;
+							$hidename=1;
+						}
+					}
+					elsif ($tag{'<'} eq '/a') {
+						if($link) {
+							$post{'content'}.='</a>';
+							$link=0;
+							$hidename=0;
+						}
+					}
+					else {
+					$post{'content'}.='<!'.$tag{'<'}.'!>';
+					}
+				}
+				
+			}
+			
+			elsif ($mode eq 'post-time') {
+				if ($tag{'<'} eq '/abbr') {
+					$mode = 'post';
+				}
+			}
+			
+			elsif ($mode eq 'post-replies') {
+				if ($tag{'<'} eq '/a') {
+					$mode = 'post';
+				}
+			}
+			
+			
 			if ($tag{"\\"} ne '') {
 				$closetag = 1;
 				next;
 			}
 			
-			
 			local $/ = '<';
 			unless (defined ($text = <$contentfile>)) {
 				close($contentfile);
@@ -455,18 +711,24 @@ sub processfile {
 				$thread{'linktext-'.$attnumber}.=$text;
 			}
 			
-			# print "text: $text\n";
+			if($mode eq 'post-content') {
+				unless ($ignoretext or $hidename){
+					$post{'content'}.=$text;
+				}
+			}
+			elsif ($mode eq 'post-time') {
+				$post{'timetext'}.=$text;
+			}
+			elsif ($mode eq 'post-replies') {
+				if($text =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
+					$post{'replies'} = $1;
+				}
+			}
+			
 		}
 		
 		
 		
-		# while (defined ($line = <$contentfile>)) {
-			# if ($firstline) {
-				# $firstline = 0;
-				# next;
-			# }
-			# # print "$line\n";
-		# }
 		close ($contentfile);
 		
 	}
@@ -494,12 +756,12 @@ sub readheaderfile {
 		my $headname='';
 		my $headval='';
 		
-		if($line =~ /^[ \t]+([^ \t].*)$/){
+		if($line =~ /^[ \t]+([^ \t](.*[^ \t])?)[ \t]*$/){
 			if($lastname ne '') {
 				$header{$lastname}.=$1;
 			}
 		}
-		elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
+		elsif ($line =~ /^([^:]+):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) {
 			$headname = lc($1);
 			$headval = $2;
 			
@@ -520,6 +782,95 @@ sub readheaderfile {
 	return %header;
 }
 
+# Very similar to header file reading.
+# Differences:
+#
+# After field name and colon there must be exactly one whitespace (space or
+# tab). Any other leading or trailing whitespace (but not the newline character
+# at the end of the line) is treated as part of the field value.
+#
+# When header field is split into multiple lines the next lines must start with
+# exactly one whitespace (tab or space) Any other leading or trailing whitespace
+# (but not the newline character at the end of the line) is treated as part of
+# the field value. the lines will be joined with a newline between them.
+#
+# When the same field name appears it replaces the previous one.
+sub readdatafile {
+	(my $headerpath) = @_;
+	my $headerfile;
+	my %header;
+	
+	if(ref($headerpath)) {
+		$headerfile=$headerpath;
+	}
+	else {
+		unless (open ($headerfile, "<", $headerpath)) {
+			return %header;
+		}
+	}
+	
+	my $lastname='';
+	
+	while (defined(my $line = <$headerfile>)) {
+		$line =~ s/[\r\n]$//g;
+		my $headname='';
+		my $headval='';
+		
+		if($line =~ /^[ \t](.*)$/){
+			if($lastname ne '') {
+				$header{$lastname}.="\n".$1;
+			}
+		}
+		elsif ($line =~ /^([^:]+):[ \t](.*)$/) {
+			$headname = lc($1);
+			$headval = $2;
+			
+			# if ($header{$headname} ne '') {
+				# $header{$headname}.=$headval;
+			# }
+			# else {
+				$header{$headname}=$headval;
+			# }
+			$lastname = $headname;
+		}
+	}
+	
+	unless (ref($headerpath)) {
+		close ($headerfile);
+	}
+	
+	return %header;
+}
+
+sub writedatafile {
+	(my $headerpath, my %header) = @_;
+	my $headerfile;
+	
+	if(ref($headerpath)) {
+		$headerfile=$headerpath;
+	}
+	else {
+		unless (open ($headerfile, ">", $headerpath)) {
+			return 0;
+		}
+	}
+	
+	foreach my $ind (keys %header) {
+		my $headname = $ind;
+		my $headval = $header{$ind};
+		$headval =~ s/\r//g;
+		$headval =~ s/\n/\n /g;
+		print $headerfile "$headname: $headval\n";
+	}
+	print $headerfile "\n";
+	
+	unless (ref($headerpath)) {
+		close ($headerfile);
+	}
+	
+	return 1;
+}
+
 sub gettimenumber {
 	(my $date) = @_;
 	my $year;
@@ -672,16 +1023,31 @@ sub saveimg {
 	(my $prot, my $host, my $port, my $path, my $query);
 	my @stat;
 	
-	print "  Image $id\n";
+	print "Image $id\n";
+	
+	$basepath=ARCH_PATH.$groupid.'/';
+	unless (-d $basepath){
+		unless (mkdir $basepath) {
+			print "Can't mkdir $basepath.\n";
+			return '';
+		}
+	}
+	$basepath.='image/';
+	unless (-d $basepath){
+		unless (mkdir $basepath) {
+			print "Can't mkdir $basepath.\n";
+			return '';
+		}
+	}
+	
 	
-	$basepath=ARCH_PATH.$groupid.'/image/';
 	my $idtemp = $id;
 	while((my $ind = index($idtemp,'/'))>=0) {
 		$basepath.=substr($idtemp, 0, $ind+1);
 		$idtemp=substr($idtemp,$ind+1);
 		unless (-d $basepath){
 			unless (mkdir $basepath) {
-				print "  Can't mkdir $basepath.\n";
+				print "Can't mkdir $basepath.\n";
 				return '';
 			}
 		}
@@ -696,7 +1062,7 @@ sub saveimg {
 			if (flock ($headfile, 2)) {
 				%header = readheaderfile($headfile);
 				if ($header{'key'} ne '') {
-					print "  Already saved.\n\n";
+					print "Already saved.\n\n";
 					close($headfile);
 					return $header{'key'};
 				}
@@ -705,7 +1071,7 @@ sub saveimg {
 				}
 			}
 			else {
-				print "  Can't lock $headpath.\n\n";
+				print "Can't lock $headpath.\n\n";
 				close($headfile);
 				return '';
 			}
@@ -716,7 +1082,7 @@ sub saveimg {
 	$archbasepath = urldiv2path($prot, $host, $port, $path, $query);
 	$archheadpath = $archbasepath.'@h';
 	$archimgpath = $archbasepath.'@v';
-	print "  url: $url\n";
+	print "url: $url\n";
 	for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
 		%archheader = readheaderfile($archheadpath); 
 		
@@ -728,7 +1094,7 @@ sub saveimg {
 			if ($location !~ /^[a-z]+:\/\//) {
 				$location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;
 			}
-			print "  Redirect: $location\n";
+			print "Redirect: $location\n";
 			($prot, $host, $port, $path, $query) = divideurl($location);
 			$archbasepath = urldiv2path($prot, $host, $port, $path, $query);
 			$archheadpath = $archbasepath.'@h';
@@ -739,7 +1105,7 @@ sub saveimg {
 		}
 	}
 	if ($archheader{'status'} !~ /^200 /) {
-		print "  Not found ($url $archheader{'status'}).\n\n";
+		print "Not found ($archheader{'status'}).\n\n";
 		if ($headopen) {
 			close ($headfile);
 		}
@@ -756,7 +1122,7 @@ sub saveimg {
 		$header{'content-length'}=$stat[7];
 	}
 	else {
-		print "  Can't stat $imgpath. $stat[7]\n";
+		print "Can't stat $imgpath. $stat[7]\n";
 	}
 	$header{'id'}=$id;
 	$header{'groupid'}=$groupid;
@@ -764,18 +1130,18 @@ sub saveimg {
 	
 	if ($headopen) {
 		unless (seek($headfile,0,0)) {
-			print "  Can't seek $headpath.\n\n";
+			print "Can't seek $headpath.\n\n";
 			close ($headfile);
 			return '';
 		}
 	}
 	else{
 		unless (open($headfile,">",$headpath)) {
-			print "  Can't open $headpath.\n\n";
+			print "Can't open $headpath.\n\n";
 			return '';
 		}
 		unless (flock ($headfile, 2)) {
-			print "  Can't lock $headpath.\n\n";
+			print "Can't lock $headpath.\n\n";
 			close($headfile);
 			return '';
 		}
@@ -783,13 +1149,13 @@ sub saveimg {
 	
 	foreach my $ind (keys %header) {
 		print $headfile "$ind: $header{$ind}\n";
-		print "  $ind: $header{$ind}\n";
+		print "$ind: $header{$ind}\n";
 	}
 	print $headfile "\n";
 	close ($headfile);
 	
 	unless (copy($archimgpath,$imgpath)) {
-		print "  Can't copy $archimgpath.\n\n";
+		print "Can't copy $archimgpath.\n\n";
 		return '';
 	}
 	print "saved.\n\n";