From 79ab006c62c1591a923c2e08960b53e1df575d98 Mon Sep 17 00:00:00 2001 From: b Date: Wed, 9 Dec 2015 11:47:32 +0000 Subject: [PATCH] Saving nextposts and images. git-svn-id: svn://botcastle1b/yplom/facebug1@6 7dec801f-c475-4e67-ba99-809552d69c55 --- bot.1.pl | 438 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 402 insertions(+), 36 deletions(-) diff --git a/bot.1.pl b/bot.1.pl index eb9f75c..61b3a77 100644 --- a/bot.1.pl +++ b/bot.1.pl @@ -117,6 +117,9 @@ sub processfile { my %thread; my %thread2; + my %post; + my %post2; + if ($headerpath =~ /^((.+)\@h)$/) { $headerpath = $1; $basepath = $2; @@ -190,6 +193,7 @@ sub processfile { my %tag; my $mode = 'thread'; my $level = 0; + my $level2; my $closetag=0; my $ignoretext; my $link; @@ -229,6 +233,7 @@ sub processfile { $level=0; $ignoretext=1; $hidename=0; + $link=0; } elsif ($tag{'<'} eq 'abbr') { $mode = 'thread-time'; @@ -243,18 +248,34 @@ sub processfile { $mode='posts'; my $threadfile; - my $threadpath = ARCH_PATH.$$settings{'id'}.'/'.'thread/'.$thread{'id'}; + my $threadpath = ARCH_PATH.$$settings{'id'}.'/'; + unless (-d $threadpath) { + unless (mkdir $threadpath) { + print "Can't mkdir $threadpath.\n"; + } + } + $threadpath.='thread/'; + unless (-d $threadpath) { + unless (mkdir $threadpath) { + print "Can't mkdir $threadpath.\n"; + } + } + $threadpath.=$thread{'id'}; # if (open ($threadfile, "+<", $threadpath)) { if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) { if (flock ($threadfile, 2)) { - # In the future it might be necessary to use a different function. - %thread2 = readheaderfile($threadfile); + %thread2 = readdatafile($threadfile); if (($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'})) { print ("Newer version already saved.\n\n"); } else { + foreach my $ind (keys %thread2) { + if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) { + delete $thread2{$ind}; + } + } foreach my $ind (keys %thread) { $thread2{$ind}=$thread{$ind}; } @@ -263,16 +284,16 @@ sub processfile { } if (seek($threadfile, 0, 0)) { + writedatafile($threadfile,%thread2); + truncate ($threadfile , tell($threadfile)); + foreach my $ind (keys %thread2) { - print $threadfile "$ind: $thread2{$ind}\n"; print "$ind: $thread2{$ind}\n"; #### } - print $threadfile "\n"; - truncate ($threadfile , tell($threadfile)); print "saved.\n\n"; } else { - print "Failed seek $threadfile.\n\n"; + print "Can't seek $threadfile.\n\n"; } } } @@ -287,6 +308,7 @@ sub processfile { } } } + elsif ($mode eq 'thread-author') { if ($tag{'<'} eq 'a') { if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { @@ -301,6 +323,7 @@ sub processfile { $mode='thread'; } } + elsif ($mode eq 'thread-content') { if ($tag{'<'} eq 'div') { # There should not be any sub
s! ++$level; @@ -309,7 +332,7 @@ sub processfile { elsif ($tag{'<'} eq '/div') { if($level){ --$level; - $thread{'postcontent'}.='
'; + $thread{'postcontent'}.='
'; } else { $mode = 'thread'; @@ -348,6 +371,7 @@ sub processfile { } } + elsif ($mode eq 'thread-time') { if ($tag{'<'} eq '/abbr') { $mode = 'thread'; @@ -369,7 +393,7 @@ sub processfile { elsif ($tag{'<'} eq 'a') { if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { ++$attnumber; - $thread{'img-'.$attnumber}=$2; + $thread{'img-'.$attnumber}='a_'.$2; $mode = 'thread-attachment-img'; } elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { @@ -387,7 +411,7 @@ sub processfile { $thread{'imgkey-'.$attnumber}=$imgkey; } else { - $thread{'img-'.$attnumber}=undef; + delete $thread{'img-'.$attnumber}; --$attnumber; } } @@ -402,7 +426,7 @@ sub processfile { } elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) { my $imgurl = urldecode($3); - my $imgid=''; + my $imgid='i_'; $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; while(length($imgurl)>240) { @@ -426,12 +450,244 @@ sub processfile { $mode = 'thread-attachment-link'; } } + + + elsif ($mode eq 'posts') { + if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) { + %post = (); + + $post{'id'} = $1; + $post{'threadid'} = $id; + $post{'groupid'} = $$settings{'id'}; + $post{'timenumber'} = $timenumber; + + $mode = 'post'; + $level=0; + $attnumber=0; + print "Post $post{'id'}\n"; + } + } + + elsif ($mode eq 'post') { + if ($tag {'<'} eq 'h3') { + $mode = 'post-author'; + } + elsif ($tag{'<'} eq 'abbr') { + $mode = 'post-time'; + } + if ($tag{'<'} eq 'div') { + if(($tag{'class'} eq '')and($post{'content'} eq '')) { + $mode = 'post-content'; + $level2=0; + $ignoretext=0; + $hidename=0; + $link=0; + } + else { + ++$level; + } + } + elsif ($tag{'<'} eq '/div') { + if ($level) { + --$level; + } + else { + $mode = 'posts'; + + my $postfile; + my $postpath = ARCH_PATH.$$settings{'id'}.'/'; + unless (-d $postpath) { + unless (mkdir $postpath) { + print "Can't mkdir $postpath.\n"; + } + } + $postpath.='post/'; + unless (-d $postpath) { + unless (mkdir $postpath) { + print "Can't mkdir $postpath.\n"; + } + } + $postpath.=$thread{'id'}.'/'; + unless (-d $postpath) { + unless (mkdir $postpath) { + print "Can't mkdir $postpath.\n"; + } + } + $postpath.=$post{'id'}; + + if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) { + if (flock ($postfile, 2)) { + %post2 = readdatafile($postfile); + + if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) { + print ("Newer version already saved.\n\n"); + } + else { + foreach my $ind (keys %post2) { + if($ind =~ /^img(key)?-[0-9]+$/) { + delete $post2{$ind}; + } + } + foreach my $ind (keys %post) { + $post2{$ind}=$post{$ind}; + } + if ($post2{'key'} eq '') { + $post2{'key'} = key(KEY_BITS); + } + if (seek($postfile, 0, 0)) { + writedatafile($postfile,%post2); + truncate ($postfile , tell($postfile)); + + foreach my $ind (keys %post2) { + print "$ind: $post2{$ind}\n"; + } + print "saved.\n\n"; + } + else { + print "Can't seek $postfile.\n\n"; + } + } + } + else { + print "Can't lock $postfile.\n\n"; + } + close ($postfile); + } + else + { + print "Can't open $postpath.\n\n"; + } + } + } + + elsif ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { + ++$attnumber; + $post{'img-'.$attnumber}='a_'.$2; + $mode = 'post-img'; + } + elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) { + $mode = 'post-replies'; + } + } + } + + elsif ($mode eq 'post-img') { + if ($tag{'<'} eq 'img') { + my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'}); + if ($imgkey ne '') { + $post{'imgkey-'.$attnumber}=$imgkey; + } + else { + delete $post{'img-'.$attnumber}; + --$attnumber; + } + } + elsif ($tag{'<'} eq '/a') { + $mode = 'post'; + } + } + + + elsif ($mode eq 'post-author') { + if ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { + my $author = $1; + if ($post{'author'} eq '') { + $post{'author'} = $author; + $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; + } + } + } + elsif ($tag{'<'} eq '/h3') { + $mode='post'; + } + } + + elsif ($mode eq 'post-content') { + if ($tag{'<'} eq 'div') { # There should not be any sub
s! + ++$level2; + $post{'content'}.='
'; + } + elsif ($tag{'<'} eq '/div') { + if($level2){ + --$level2; + $post{'content'}.='
'; + } + else { + $mode = 'post'; + } + } + elsif ($tag{'<'} eq 'p') { + $post{'content'}.='

'; + } + elsif ($tag{'<'} eq '/p') { + $post{'content'}.='

'; + } + elsif (!$ignoretext) { + if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) { + my $imgurl = urldecode($2); + my $imgid = 'r_'; + $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; + + while(length($imgurl)>240) { + $imgid.=substr($imgurl,0,120).'-/'; + $imgurl=substr($imgurl,120); + } + $imgid.=$imgurl; + + my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'}); + if ($imgkey ne '') { + $post{'content'}.=''.$tag{'alt'}.''; + } + else { + $post{'content'}.=''.$tag{'alt'}.''; + } + + } + elsif ($tag{'<'} eq 'a') { + if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + $post{'content'}.=''; + $link=1; + } + elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { + $post{'content'}.=''.(($$names{$1} ne '')?$$names{$1}:$$names{'default'}); + $link=1; + $hidename=1; + } + } + elsif ($tag{'<'} eq '/a') { + if($link) { + $post{'content'}.=''; + $link=0; + $hidename=0; + } + } + else { + $post{'content'}.=''; + } + } + + } + + elsif ($mode eq 'post-time') { + if ($tag{'<'} eq '/abbr') { + $mode = 'post'; + } + } + + elsif ($mode eq 'post-replies') { + if ($tag{'<'} eq '/a') { + $mode = 'post'; + } + } + + if ($tag{"\\"} ne '') { $closetag = 1; next; } - local $/ = '<'; unless (defined ($text = <$contentfile>)) { close($contentfile); @@ -455,18 +711,24 @@ sub processfile { $thread{'linktext-'.$attnumber}.=$text; } - # print "text: $text\n"; + if($mode eq 'post-content') { + unless ($ignoretext or $hidename){ + $post{'content'}.=$text; + } + } + elsif ($mode eq 'post-time') { + $post{'timetext'}.=$text; + } + elsif ($mode eq 'post-replies') { + if($text =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) { + $post{'replies'} = $1; + } + } + } - # while (defined ($line = <$contentfile>)) { - # if ($firstline) { - # $firstline = 0; - # next; - # } - # # print "$line\n"; - # } close ($contentfile); } @@ -494,12 +756,12 @@ sub readheaderfile { my $headname=''; my $headval=''; - if($line =~ /^[ \t]+([^ \t].*)$/){ + if($line =~ /^[ \t]+([^ \t](.*[^ \t])?)[ \t]*$/){ if($lastname ne '') { $header{$lastname}.=$1; } } - elsif ($line =~ /^([^:]*):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) { + elsif ($line =~ /^([^:]+):[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/) { $headname = lc($1); $headval = $2; @@ -520,6 +782,95 @@ sub readheaderfile { return %header; } +# Very similar to header file reading. +# Differences: +# +# After field name and colon there must be exactly one whitespace (space or +# tab). Any other leading or trailing whitespace (but not the newline character +# at the end of the line) is treated as part of the field value. +# +# When header field is split into multiple lines the next lines must start with +# exactly one whitespace (tab or space) Any other leading or trailing whitespace +# (but not the newline character at the end of the line) is treated as part of +# the field value. the lines will be joined with a newline between them. +# +# When the same field name appears it replaces the previous one. +sub readdatafile { + (my $headerpath) = @_; + my $headerfile; + my %header; + + if(ref($headerpath)) { + $headerfile=$headerpath; + } + else { + unless (open ($headerfile, "<", $headerpath)) { + return %header; + } + } + + my $lastname=''; + + while (defined(my $line = <$headerfile>)) { + $line =~ s/[\r\n]$//g; + my $headname=''; + my $headval=''; + + if($line =~ /^[ \t](.*)$/){ + if($lastname ne '') { + $header{$lastname}.="\n".$1; + } + } + elsif ($line =~ /^([^:]+):[ \t](.*)$/) { + $headname = lc($1); + $headval = $2; + + # if ($header{$headname} ne '') { + # $header{$headname}.=$headval; + # } + # else { + $header{$headname}=$headval; + # } + $lastname = $headname; + } + } + + unless (ref($headerpath)) { + close ($headerfile); + } + + return %header; +} + +sub writedatafile { + (my $headerpath, my %header) = @_; + my $headerfile; + + if(ref($headerpath)) { + $headerfile=$headerpath; + } + else { + unless (open ($headerfile, ">", $headerpath)) { + return 0; + } + } + + foreach my $ind (keys %header) { + my $headname = $ind; + my $headval = $header{$ind}; + $headval =~ s/\r//g; + $headval =~ s/\n/\n /g; + print $headerfile "$headname: $headval\n"; + } + print $headerfile "\n"; + + unless (ref($headerpath)) { + close ($headerfile); + } + + return 1; +} + sub gettimenumber { (my $date) = @_; my $year; @@ -672,16 +1023,31 @@ sub saveimg { (my $prot, my $host, my $port, my $path, my $query); my @stat; - print " Image $id\n"; + print "Image $id\n"; + + $basepath=ARCH_PATH.$groupid.'/'; + unless (-d $basepath){ + unless (mkdir $basepath) { + print "Can't mkdir $basepath.\n"; + return ''; + } + } + $basepath.='image/'; + unless (-d $basepath){ + unless (mkdir $basepath) { + print "Can't mkdir $basepath.\n"; + return ''; + } + } + - $basepath=ARCH_PATH.$groupid.'/image/'; my $idtemp = $id; while((my $ind = index($idtemp,'/'))>=0) { $basepath.=substr($idtemp, 0, $ind+1); $idtemp=substr($idtemp,$ind+1); unless (-d $basepath){ unless (mkdir $basepath) { - print " Can't mkdir $basepath.\n"; + print "Can't mkdir $basepath.\n"; return ''; } } @@ -696,7 +1062,7 @@ sub saveimg { if (flock ($headfile, 2)) { %header = readheaderfile($headfile); if ($header{'key'} ne '') { - print " Already saved.\n\n"; + print "Already saved.\n\n"; close($headfile); return $header{'key'}; } @@ -705,7 +1071,7 @@ sub saveimg { } } else { - print " Can't lock $headpath.\n\n"; + print "Can't lock $headpath.\n\n"; close($headfile); return ''; } @@ -716,7 +1082,7 @@ sub saveimg { $archbasepath = urldiv2path($prot, $host, $port, $path, $query); $archheadpath = $archbasepath.'@h'; $archimgpath = $archbasepath.'@v'; - print " url: $url\n"; + print "url: $url\n"; for (my $ind=0; $ind",$headpath)) { - print " Can't open $headpath.\n\n"; + print "Can't open $headpath.\n\n"; return ''; } unless (flock ($headfile, 2)) { - print " Can't lock $headpath.\n\n"; + print "Can't lock $headpath.\n\n"; close($headfile); return ''; } @@ -783,13 +1149,13 @@ sub saveimg { foreach my $ind (keys %header) { print $headfile "$ind: $header{$ind}\n"; - print " $ind: $header{$ind}\n"; + print "$ind: $header{$ind}\n"; } print $headfile "\n"; close ($headfile); unless (copy($archimgpath,$imgpath)) { - print " Can't copy $archimgpath.\n\n"; + print "Can't copy $archimgpath.\n\n"; return ''; } print "saved.\n\n"; -- 2.30.2