From: b Date: Thu, 17 Dec 2015 19:05:36 +0000 (+0000) Subject: Explanation X-Git-Url: http://bicyclesonthemoon.info/git-projects/?a=commitdiff_plain;h=dc3647a7990e62d28fd86dc9202ad303bc74b5fe;p=yplom%2Ffacebug1 Explanation git-svn-id: svn://botcastle1b/yplom/facebug1@9 7dec801f-c475-4e67-ba99-809552d69c55 --- diff --git a/bot.1.pl b/bot.1.pl index de81ef3..1ddcf58 100644 --- a/bot.1.pl +++ b/bot.1.pl @@ -1,5 +1,11 @@ ###PERL; +# bot is generated from bot.1.pl +# +# This is the facebook bot. It depends on the proxy. +# It reads pages from m.facebook.com archived on the proxy, extracts threads, +# posts, images, etc. from a facebook group and saves them. + use strict; use Fcntl; use File::Copy; @@ -16,11 +22,16 @@ my $time = time(); srand ($time-$$); print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n"; +# If there are commandline arguments the bot will only process the facebook +# group(s) defined by them. The arguments are numbers which are the group's ID +# and also the name of the config file. if (scalar @ARGV) { for (my $ind=0; $ind < scalar @ARGV; ++$ind) { processgroup($ARGV[$ind]); } } +# If there were no commandline arguments the bot will process all groups which +# have a config file defined. else { my $dir; my $subpath; @@ -32,6 +43,8 @@ else { } } +# The function to process one facebook group. The argument is a number which is +# the group's ID and also the config file name. sub processgroup { (my $filenumber) = @_; my $settingspath; @@ -41,6 +54,7 @@ sub processgroup { my %names; my $archpath; + # ID must be a number if ($filenumber =~ /^([0-9]+)$/) { $settingspath = GROUPSETTINGS_PATH.$1; } @@ -48,9 +62,15 @@ sub processgroup { return; } + # read the configuration file %settings = readconfigfile($settingspath); + # The settings defined so far: + # id - the group's ID. SHOULD be the same as the file name. + # hidenames - defines if the real names of people will be hidden or converted + # "0" or "no" means no, anything else means yes. default value if + # undefined is yes. Not implemented yet (maybe never will). Now it + # will always hide names. - #The group id SHOULD be the filename. But what if it isn't? if ($settings{'id'} =~ /^([0-9]+)$/) { $groupid=$1; } @@ -58,6 +78,7 @@ sub processgroup { return; } + #read the list of names. A default name MUST be defined. $namespath = GROUPSETTINGS_PATH.$groupid.'-names'; %names = readconfigfile($namespath); if ($names{'default'} eq '') { @@ -65,17 +86,21 @@ sub processgroup { } print "Group $groupid\n"; - + # process the pages with threads: $archpath = url2path('https://m.facebook.com/groups/'.$groupid); processdir($archpath.'@q/',\%settings,\%names,0); processdir($archpath.'/@q/',\%settings,\%names,0); - + # process the pages with posts: $archpath = url2path('https://m.facebook.com/comment/replies'); processdir($archpath.'@q/',\%settings,\%names,1); processdir($archpath.'/@q/',\%settings,\%names,1); } +# The function to process all files in a directory and (recursively) all +# subdirectories. +# The first argument is the directory path (ending with "/"). +# The three other arguments are passed to processfile(). sub processdir { (my $dirpath, my $settings, my $names, my $pagemode) = @_; my $dir; @@ -88,6 +113,7 @@ sub processdir { } while (defined($subpath = readdir $dir)) { $subpathfull=$dirpath.$subpath; + # "." or ".." should be ignored if ($subpath =~ /^\.\.?$/) { next; } @@ -98,9 +124,21 @@ sub processdir { processdir($subpathfull.'/', $settings, $names, $pagemode); } } - closedir ($dir); + closedir ($dir); } +# The function to process one archived page. +# Arguments: +# 1. The path to the archived header. Ends with "@h". Otherwise function just +# returns without doing anything. +# 2. reference to the hash with group settings +# 3. reference to the hash with names +# 4. mode, determines how pages are interpreted. If 0 page type can be +# 'thread' - one thread +# 'group' - list of threads' firstposts +# If nonzero page type can be +# 'post' - one post (with replies) +# sub processfile { (my $headerpath, my $settings, my $names, my $pagemode) = @_; my $basepath; @@ -117,19 +155,20 @@ sub processfile { my %cgi; - my $postid; - my $threadid; - my $groupid=0; ###! - my $timenumber; + my $postid; #id of post + my $threadid; #id of thread + my $groupid=0; #determines if group id was found on page. REAL group id is in $$settings{'id'}! + my $timenumber; #number, determines when page was saved on the proxy. - my %thread; - my %thread2; + my %thread; #thread info, created by interpreting the page + my %thread2; #thread info, created by reading previously archived file - my %post; - my %post2; + my %post; #post info, created by interpreting the page + my %post2; #post info, created by reading previously archived file - my $pagetype; + my $pagetype; #type of page: 'group', 'thread' or 'post' + # Argument must be HEADER path. if ($headerpath =~ /^((.+)\@h)$/) { $headerpath = $1; $basepath = $2; @@ -142,7 +181,8 @@ sub processfile { ($prot, $host, $port, $path, $query) = path2urldiv($basepath); print 'Page '.joinurl($prot, $host, $port, $path, $query)."\n"; - ### REDESIGN THE CONDITIONS! + + #Determine what type of page it is. If none ot the three types - return. if($query ne '') { %cgi=getcgi($query); if($pagemode) { @@ -176,14 +216,14 @@ sub processfile { $pagetype = 'group'; } } - print " type=$pagetype\n"; + print "type=$pagetype\n"; + # Read the http header. Only interested in the status. If a redirection then + # follow it and read again. If 200 it's ok to continue processing. Otherwise + # return. for (my $ind=0; $ind levels the bot went + my $level2; # same as above used in different state (previous must be kept) + my $closetag=0;# if there is a tag to close + my $ignoretext;# if text should be ignored and not added to post/thread content + my $link; # if bot is inside a link + my $hidename; # if bot is inside a part which contains a name to be hidden + my $attnumber; # number of current attachment + my $incomplete;# if thread firstpost's content is incomplete + my $firstpost; # if the bot is in the firstpost (important if pagetype='post') + + # Set initial values depending on page type. + if ($pagetype eq 'thread') { + print "Thread $threadid\n"; + $thread{'id'}=$threadid; + $thread{'groupid'}=$$settings{'id'}; + $thread{'timenumber'}=$timenumber; + $mode = 'thread'; + $level=0; + $attnumber=0; + $incomplete=0; + } + elsif ($pagetype eq 'post'){ + print "Post $postid ($threadid)\n"; - unless (open ($contentfile, "<",$contentpath)) { - print "Can't open $contentpath.\n"; - return; + $mode='posts'; + $firstpost=1; + } + else { #group + print "Threads\n"; + $mode = 'threads'; + } + + my $line; + + # The main loop. It reads the file like this: + # texttexttext... + # If there two tags next to each other there still is text + # between them, only with zero length. + # It the tag looks like this: + # + # the bot will interpret it as + # + # without the 0-length text between. + # + # This is a state machine + # In the main loop the tag is read and depending on the tag's content, state + # and some variable values some actions can be taken. + # Then the text is read and again depending ot the text, state and some + # variable values some actions can be taken. + # This loop continues until the end of file. + + #read and ignore text before the first tag. + local $/ = '<'; + unless (defined ($text = <$contentfile>)) { + close($contentfile); + return; + } + # main loop + while ($mode ne '') { + # if there was a tag ending with "/>" in the previous iteration then in this + # one it will be treated as the same tag but with "'; + unless (defined ($text = <$contentfile>)) { + close($contentfile); + return; + } + $text =~ s/>$//; + # # DEBUG: + # if($pagetype eq 'thread'){ + # print ">>$mode: <$text>\n"; + # } - $mode='posts'; - $firstpost=1; - } - else { #group - print "Threads\n"; - $mode = 'threads'; + # get the attributes of the tag + # special values: '<' - tag name (with or without '/'), '/' - when it's a + # closing tag, '\' when tag ends with "/>". + %tag = taginfo($text); } + local $/ = "\n"; - my $line; - - local $/ = '<'; - unless (defined ($text = <$contentfile>)) { - close($contentfile); - return; + # List of threads. Look for
s with threads + if ($mode eq 'threads'){ + # Thread found. Id not known yet! + if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) { + print "Thread [$1]\n"; + $mode = 'thread'; + # set initial values + %thread = (); + $thread{'groupid'}=$$settings{'id'}; + $thread{'timenumber2'}=$timenumber; + $level = 0; + $attnumber=0; + $incomplete=0; + } } - while ($mode ne '') { - if ($closetag){ - $tag{'<'} = '/'.$tag{'<'}; - $tag{'/'}='/'; - delete $tag{"\\"}; - $closetag=0; + + # one thread + elsif ($mode eq 'thread'){ + # thread author is in first

+ if ($tag{'<'} eq 'h3') { + $mode = 'thread-author'; } - else { - local $/ = '>'; - unless (defined ($text = <$contentfile>)) { - close($contentfile); - return; + elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) { + # Post content is (always?) in the first
with a 2 letter class name after author + if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) { + $mode='thread-content'; + $level2=0; + $ignoretext=1; # text in firstposts only inside

+ $hidename=0; + $link=0; + } + else { + ++$level; } - $text =~ s/>$//; - # # DEBUG: - # if($pagetype eq 'thread'){ - # print ">>$mode: <$text>\n"; - # } - %tag = taginfo($text); } - local $/ = "\n"; + elsif (($tag{'<'} eq '/div') and $level) { + --$level; + } - if ($mode eq 'threads'){ - if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) { - print "Thread [$1]\n"; - $mode = 'thread'; - %thread = (); - $thread{'groupid'}=$$settings{'id'}; - $thread{'timenumber2'}=$timenumber; - $level = 0; - $attnumber=0; - $incomplete=0; - } + # the time text is in the only in thread firstpost + elsif ($tag{'<'} eq 'abbr') { + $mode = 'thread-time'; } - elsif ($mode eq 'thread'){ - # print "+++$text+++\n"; - if ($tag{'<'} eq 'h3') { - $mode = 'thread-author'; + elsif ($tag{'<'} eq 'a') { + # there is an image attached + if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { + ++$attnumber; + $thread{'img-'.$attnumber}='a_'.$2; + $mode = 'thread-attachment-img'; } - elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) { - # These are very 'helpful' class names, facebug, thank you! - # After recent changes do I still need the unreliable class name? - # Is post content always in first

after author with a 2 letter class name? - # Let's test: - if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) { - # if (($tag{'class'} =~ /^(bj|bk|bm)|(db|da)$/) and (!defined($thread{'postcontent'}))) { - $mode='thread-content'; - $level2=0; - $ignoretext=1; - $hidename=0; - $link=0; - } - # elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)|(dc|db)$/)) { ### NAMES NOT RELIABLE! HAVE TO IMPROVE SERIOUSLY! - # $mode='thread-attachment'; - # $level2=0; - # $attnumber=0; - # } - else { - ++$level; + # there is a link attached + elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + ++$attnumber; + $thread{'link-'.$attnumber}=urldecode($3); + $mode = 'thread-attachment-link'; + } + # if thread id is not known it can be determined from this link. + # also, the number of replies may be found in one of these links. + elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) { + if ($thread{'id'} eq '') { + $thread{'id'} = $2; + print "Thread $thread{'id'}\n"; } + $mode = 'thread-replies'; } - elsif (($tag{'<'} eq '/div') and $level) { - --$level; + } + + # Depending on page mode the thread and firstpost information can end in + # two different ways. When page type is 'thread' the end is at the
+ # whose id starts with "ufi_". When page type is 'group' it ends when + # leaving the thread-related div + # + elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) { + # depending on page type the rest of the page contains posts or other + # threads. + if ($pagetype eq 'thread') { + $mode='posts'; } - - elsif ($tag{'<'} eq 'abbr') { - $mode = 'thread-time'; + else { + $mode='threads'; } - elsif ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { - ++$attnumber; - $thread{'img-'.$attnumber}='a_'.$2; - $mode = 'thread-attachment-img'; - } - elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { - ++$attnumber; - $thread{'link-'.$attnumber}=urldecode($3); - $mode = 'thread-attachment-link'; + # Now prepare to save the file + my $threadfile; + my $threadpath = ARCH_PATH.$$settings{'id'}.'/'; + unless (-d $threadpath) { + unless (mkdir $threadpath) { + print "Can't mkdir $threadpath.\n"; } - elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) { - if ($thread{'id'} eq '') { - $thread{'id'} = $2; - print "Thread $thread{'id'}\n"; - } - $mode = 'thread-replies'; + } + $threadpath.='thread/'; + unless (-d $threadpath) { + unless (mkdir $threadpath) { + print "Can't mkdir $threadpath.\n"; } } + $threadpath.=$thread{'id'}; - elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) { - if ($pagetype eq 'thread') { - $mode='posts'; - } - else { - $mode='threads'; - } - - my $threadfile; - my $threadpath = ARCH_PATH.$$settings{'id'}.'/'; - unless (-d $threadpath) { - unless (mkdir $threadpath) { - print "Can't mkdir $threadpath.\n"; - } - } - $threadpath.='thread/'; - unless (-d $threadpath) { - unless (mkdir $threadpath) { - print "Can't mkdir $threadpath.\n"; + if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) { + if (flock ($threadfile, 2)) { + # read the data already saved in file + %thread2 = readdatafile($threadfile); + + # in 'threads' page type the firstpost's content may be incomplete. + # in that case it should not be written to file if there is already + # one. Even if the file has an older version. + # but in this page type there is information about the number of + # replies, not found in 'thread' page type. This information should + # be written even when the post contnet shouldn't. + # + # That's why a thread has the timenumber and timenumber2. + # timenumber defines the time of the post content. in pagetype + # 'thread' only timenumber is checked and only timenumber is + # updated. + # timenumber2 defines the time of information only available in the + # 'group' page type. In this pagetype the post content is only + # updated if complete and timenumber allows it. Other information is + # updated if timenumber2 allows it + + # Don't overwrite newer information with older. + if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) { + print ("Newer version already saved.\n\n"); } - } - $threadpath.=$thread{'id'}; - - # if (open ($threadfile, "+<", $threadpath)) { - if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) { - if (flock ($threadfile, 2)) { - %thread2 = readdatafile($threadfile); - - if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) { - print ("Newer version already saved.\n\n"); - } - else { - if($pagetype ne 'thread'){ - if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) { - print ("Newer version of post content already saved.\n"); - delete $thread{'postcontent'}; - } - elsif($incomplete) { - print ("Post content incomplete.\n"); - if(defined($thread2{'postcontent'})){ - delete $thread{'postcontent'}; - } - } - else { - $thread{'timenumber'}=$thread{'timenumber2'}; - } + else { + if($pagetype ne 'thread'){ + # Don't overwrite newer post content with older. + if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) { + print ("Newer version of post content already saved.\n"); + delete $thread{'postcontent'}; } - foreach my $ind (keys %thread2) { - if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) { - delete $thread2{$ind}; + elsif($incomplete) { + # Don't overwrite complete post content with incomplete one. + # Write incomplete content if nothing was archived before, + # better this than nothing. + print ("Post content incomplete.\n"); + if(defined($thread2{'postcontent'})){ + delete $thread{'postcontent'}; } } - foreach my $ind (keys %thread) { - $thread2{$ind}=$thread{$ind}; + else { + $thread{'timenumber'}=$thread{'timenumber2'}; } - if ($thread2{'key'} eq '') { - $thread2{'key'} = key(KEY_BITS); + } + # delete previous information about attachments - the numbers + # could have changed. + foreach my $ind (keys %thread2) { + if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) { + delete $thread2{$ind}; } + } + + # overwrite previous information with new one + foreach my $ind (keys %thread) { + $thread2{$ind}=$thread{$ind}; + } + if ($thread2{'key'} eq '') { + $thread2{'key'} = key(KEY_BITS); + } + + # write data to file + if (seek($threadfile, 0, 0)) { + writedatafile($threadfile,%thread2); + truncate ($threadfile , tell($threadfile)); - if (seek($threadfile, 0, 0)) { - writedatafile($threadfile,%thread2); - truncate ($threadfile , tell($threadfile)); - - foreach my $ind (keys %thread2) { - print "$ind: $thread2{$ind}\n"; #### - } - print "saved.\n\n"; - } - else { - print "Can't seek $threadfile.\n\n"; + foreach my $ind (keys %thread2) { + print "$ind: $thread2{$ind}\n"; #### } + print "saved.\n\n"; + } + else { + print "Can't seek $threadfile.\n\n"; } - } - else { - print "Can't lock $threadfile.\n\n"; - } - close ($threadfile); - } - else - { - print "Can't open $threadpath.\n\n"; - } - } - } - - elsif ($mode eq 'thread-author') { - if ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { - my $author = $1; - if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { - $author = urldecode($2); - } - if ($thread{'author'} eq '') { - $thread{'author'} = $author; - $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; - } - } - } - elsif ($tag{'<'} eq '/h3') { - $mode='thread'; - } - } - - elsif ($mode eq 'thread-content') { - if ($tag{'<'} eq 'div') { # There should not be any sub
s! - ++$level2; - # $thread{'postcontent'}.='
'; - $ignoretext=1; - } - elsif ($tag{'<'} eq '/div') { - if($level2){ - --$level2; - # $thread{'postcontent'}.='
'; - unless($level2) { - $ignoretext=0; } } else { - $mode = 'thread'; + print "Can't lock $threadfile.\n\n"; } + close ($threadfile); } - elsif ($tag{'<'} eq 'br') { - $thread{'postcontent'}.='
'; - $ignoretext=0; + else + { + print "Can't open $threadpath.\n\n"; } - elsif ($tag{'<'} eq 'p') { - $thread{'postcontent'}.='

'; - $ignoretext=0; - } - elsif ($tag{'<'} eq '/p') { - $thread{'postcontent'}.='

'; - $ignoretext=1; - } - elsif (!$ignoretext) { - if ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { - $thread{'postcontent'}.=''; - $link=1; - } - elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { - my $person=$1; - if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { - $person = urldecode($2); - } - $thread{'postcontent'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); - $link=1; - $hidename=1; - } - } - elsif ($tag{'<'} eq '/a') { - if($link) { - $thread{'postcontent'}.=''; - $link=0; - $hidename=0; - } - } - else { - # $thread{'postcontent'}.=''; + } + } + + # author name + elsif ($mode eq 'thread-author') { + # name can be found in hyperlinks + if ($tag{'<'} eq 'a') { + # there are two types of facebook user IDs + if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { + my $author = $1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $author = urldecode($2); } - } - elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) { - unless($incomplete) { - $thread{'postcontent'}.='

Post not completely archived.

'; + if ($thread{'author'} eq '') { + $thread{'author'} = $author; + $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; } - $incomplete=1; } } - - elsif ($mode eq 'thread-time') { - if ($tag{'<'} eq '/abbr') { - $mode = 'thread'; - } + # go out of

+ elsif ($tag{'<'} eq '/h3') { + $mode='thread'; } - - - # elsif ($mode eq 'thread-attachment') { - # if ($tag{'<'} eq 'div') { - # ++$level2; - # } - # elsif ($tag{'<'} eq '/div') { - # if($level2){ - # --$level2; - # } - # else { - # $mode = 'thread'; - # } - # } - # elsif ($tag{'<'} eq 'a') { - # if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { - # ++$attnumber; - # $thread{'img-'.$attnumber}='a_'.$2; - # $mode = 'thread-attachment-img'; - # } - # elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { - # ++$attnumber; - # $thread{'link-'.$attnumber}=urldecode($3); - # $mode = 'thread-attachment-link'; - # } - # } - # } - - elsif ($mode eq 'thread-attachment-img') { - if ($tag{'<'} eq 'img') { - my $imgkey = saveimg($tag{'src'},$thread{'img-'.$attnumber},$$settings{'id'}); - if ($imgkey ne '') { - $thread{'imgkey-'.$attnumber}=$imgkey; - } - else { - delete $thread{'img-'.$attnumber}; - --$attnumber; + } + + # the firstpost's content + elsif ($mode eq 'thread-content') { + # There should not be any sub
s. Ignore everything inside. + if ($tag{'<'} eq 'div') { + ++$level2; + $ignoretext=1; + } + elsif ($tag{'<'} eq '/div') { + if($level2){ + --$level2; + unless($level2) { + $ignoretext=0; } } - elsif ($tag{'<'} eq '/a') { + else { $mode = 'thread'; } } - - elsif ($mode eq 'thread-attachment-link') { - if($tag{'<'} eq 'h3'){ - $mode = 'thread-attachment-link-title'; - } - elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) { - my $imgurl = urldecode($3); - my $imgid='i_'; + elsif ($tag{'<'} eq 'br') { + $thread{'postcontent'}.='
'; + $ignoretext=0; + } + elsif ($tag{'<'} eq 'p') { + $thread{'postcontent'}.='

'; + $ignoretext=0; + } + elsif ($tag{'<'} eq '/p') { + $thread{'postcontent'}.='

'; + $ignoretext=1; + } + elsif (!$ignoretext) { + # inline image (smiley?) + if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) { + my $imgurl = urldecode($2); + my $imgid = 'r_'; $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; while(length($imgurl)>240) { @@ -583,373 +605,488 @@ sub processfile { $imgurl=substr($imgurl,120); } $imgid.=$imgurl; + my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'}); if ($imgkey ne '') { - $thread{'img-'.$attnumber}=$imgid; - $thread{'imgkey-'.$attnumber}=$imgkey; + $thread{'postcontent'}.=''.$tag{'alt'}.''; + } + else { + $thread{'postcontent'}.=''.$tag{'alt'}.''; + } + } + elsif ($tag{'<'} eq 'a') { + # a link to an external page + if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + $thread{'postcontent'}.=''; + $link=1; + } + # a link to a user + elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { + my $person=$1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $person = urldecode($2); + } + $thread{'postcontent'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); + $link=1; + $hidename=1; } } elsif ($tag{'<'} eq '/a') { - $mode = 'thread'; + if($link) { + $thread{'postcontent'}.=''; + $link=0; + $hidename=0; + } + } + else { + # $thread{'postcontent'}.=''; } } - - elsif ($mode eq 'thread-attachment-link-title') { - if ($tag{'<'} eq '/h3') { - $mode = 'thread-attachment-link'; + # a link for "more..." outside the

s = past incomplete! + elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) { + unless($incomplete) { + $thread{'postcontent'}.='

Post not completely archived.

'; } + $incomplete=1; } - - elsif ($mode eq 'thread-replies') { - if ($tag{'<'} eq '/a') { - $mode = 'thread'; + } + + # time + elsif ($mode eq 'thread-time') { + if ($tag{'<'} eq '/abbr') { + $mode = 'thread'; + } + } + + # an attached image + elsif ($mode eq 'thread-attachment-img') { + if ($tag{'<'} eq 'img') { + my $imgkey = saveimg($tag{'src'},$thread{'img-'.$attnumber},$$settings{'id'}); + if ($imgkey ne '') { + $thread{'imgkey-'.$attnumber}=$imgkey; + } + else { + delete $thread{'img-'.$attnumber}; + --$attnumber; } } - - - elsif ($mode eq 'posts') { - if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) { - %post = (); - - $post{'id'} = $1; - $post{'threadid'} = $threadid; - $post{'groupid'} = $$settings{'id'}; - $post{'timenumber'} = $timenumber; - - $mode = 'post'; - $level=0; - $attnumber=0; - - if($pagetype eq 'post') { - if(!$groupid) { - print "Can't determine if post belongs to group $$settings{'id'}.\n"; - $mode=''; - last; - } - elsif($post{'id'} eq $postid) { - $firstpost=1; - } - else { - $firstpost=0; - $post{'postid'} = $postid; - } - } - print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n"; + elsif ($tag{'<'} eq '/a') { + $mode = 'thread'; + } + } + + #an attached link + elsif ($mode eq 'thread-attachment-link') { + #"title" is found in

+ if($tag{'<'} eq 'h3'){ + $mode = 'thread-attachment-link-title'; + } + # an image included to illustrate the attached link. Facebook automatically chooses it... + elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) { + my $imgurl = urldecode($3); + my $imgid='i_'; + $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; + + while(length($imgurl)>240) { + $imgid.=substr($imgurl,0,120).'-/'; + $imgurl=substr($imgurl,120); + } + $imgid.=$imgurl; + my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'}); + if ($imgkey ne '') { + $thread{'img-'.$attnumber}=$imgid; + $thread{'imgkey-'.$attnumber}=$imgkey; } - elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) { - if ($1 eq $$settings{'id'}) { - $groupid = 1; + } + elsif ($tag{'<'} eq '/a') { + $mode = 'thread'; + } + } + + elsif ($mode eq 'thread-attachment-link-title') { + if ($tag{'<'} eq '/h3') { + $mode = 'thread-attachment-link'; + } + } + + elsif ($mode eq 'thread-replies') { + if ($tag{'<'} eq '/a') { + $mode = 'thread'; + } + } + + # list of posts. look for
s with posts. + elsif ($mode eq 'posts') { + # post found + if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) { + %post = (); + + $post{'id'} = $1; + $post{'threadid'} = $threadid; + $post{'groupid'} = $$settings{'id'}; + $post{'timenumber'} = $timenumber; + + $mode = 'post'; + $level=0; + $attnumber=0; + + # if page type is 'post' the URL is not enough to see which group the + # posts belongs to. has to be found on the page BEFORE entering first + # post
! + if($pagetype eq 'post') { + if(!$groupid) { + print "Can't determine if post belongs to group $$settings{'id'}.\n"; + $mode=''; + last; + } + elsif($post{'id'} eq $postid) { + # In this page type it's important to check if this post is a reply + # to a thread or to a post. + $firstpost=1; } else { - print "Post does not belong to group $$settings{'id'}.\n"; - $mode = ''; - last; + $firstpost=0; + $post{'postid'} = $postid; } } + print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n"; } - - elsif ($mode eq 'post') { - if ($tag {'<'} eq 'h3') { - $mode = 'post-author'; + # Link can be used to determine which group the post belongs to. + elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) { + if ($1 eq $$settings{'id'}) { + $groupid = 1; } - elsif ($tag{'<'} eq 'abbr') { - $mode = 'post-time'; + else { + print "Post does not belong to group $$settings{'id'}.\n"; + $mode = ''; + last; } - if ($tag{'<'} eq 'div') { - if(($tag{'class'} eq '')and(!defined($post{'content'}))) { - $mode = 'post-content'; - $level2=0; - $ignoretext=0; - $hidename=0; - $link=0; - } - else { - ++$level; - } + } + } + + # one post + elsif ($mode eq 'post') { + # thread author is in first

+ if ($tag {'<'} eq 'h3') { + $mode = 'post-author'; + } + elsif ($tag{'<'} eq 'abbr') { + $mode = 'post-time'; + } + if ($tag{'<'} eq 'div') { + # Post content is (always?) in the first
without a class name after author + if(($tag{'class'} eq '')and(!defined($post{'content'}))) { + $mode = 'post-content'; + $level2=0; + $ignoretext=0; + $hidename=0; + $link=0; + } + else { + ++$level; + } + } + elsif ($tag{'<'} eq '/div') { + if ($level) { + --$level; } - elsif ($tag{'<'} eq '/div') { - if ($level) { - --$level; + else { + # end of post
. Now prepare to save the file. + # path depends if it's a post or a postreply. + $mode = 'posts'; + + my $postfile; + my $postpath = ARCH_PATH.$$settings{'id'}.'/'; + unless (-d $postpath) { + unless (mkdir $postpath) { + print "Can't mkdir $postpath.\n"; + } + } + if(($pagemode eq 'post')and !$firstpost){ + $postpath.='postreply/'; } else { - $mode = 'posts'; - - my $postfile; - my $postpath = ARCH_PATH.$$settings{'id'}.'/'; - unless (-d $postpath) { - unless (mkdir $postpath) { - print "Can't mkdir $postpath.\n"; - } - } - if(($pagemode eq 'post')and !$firstpost){ - $postpath.='postreply/'; + $postpath.='post/'; + } + unless (-d $postpath) { + unless (mkdir $postpath) { + print "Can't mkdir $postpath.\n"; } - else { - $postpath.='post/'; + } + $postpath.=$post{'threadid'}.'/'; + unless (-d $postpath) { + unless (mkdir $postpath) { + print "Can't mkdir $postpath.\n"; } + } + if(($pagemode eq 'post')and !$firstpost){ + $postpath.=$post{'postid'}.'/'; unless (-d $postpath) { unless (mkdir $postpath) { print "Can't mkdir $postpath.\n"; } } - $postpath.=$post{'threadid'}.'/'; - unless (-d $postpath) { - unless (mkdir $postpath) { - print "Can't mkdir $postpath.\n"; + } + + $postpath.=$post{'id'}; + + if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) { + if (flock ($postfile, 2)) { + %post2 = readdatafile($postfile); + + # Don't overwrite newer information with older. + if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) { + print ("Newer version already saved.\n\n"); } - } - if(($pagemode eq 'post')and !$firstpost){ - $postpath.=$post{'postid'}.'/'; - unless (-d $postpath) { - unless (mkdir $postpath) { - print "Can't mkdir $postpath.\n"; + else { + # delete previous information about attachments - the numbers + # could have changed. + foreach my $ind (keys %post2) { + if($ind =~ /^img(key)?-[0-9]+$/) { + delete $post2{$ind}; + } } - } - } - - $postpath.=$post{'id'}; - - if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) { - if (flock ($postfile, 2)) { - %post2 = readdatafile($postfile); - - if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) { - print ("Newer version already saved.\n\n"); + # overwrite previous information with new one + foreach my $ind (keys %post) { + $post2{$ind}=$post{$ind}; } - else { + if ($post2{'key'} eq '') { + $post2{'key'} = key(KEY_BITS); + } + # write data to file + if (seek($postfile, 0, 0)) { + writedatafile($postfile,%post2); + truncate ($postfile , tell($postfile)); + foreach my $ind (keys %post2) { - if($ind =~ /^img(key)?-[0-9]+$/) { - delete $post2{$ind}; - } - } - foreach my $ind (keys %post) { - $post2{$ind}=$post{$ind}; - } - if ($post2{'key'} eq '') { - $post2{'key'} = key(KEY_BITS); - } - if (seek($postfile, 0, 0)) { - writedatafile($postfile,%post2); - truncate ($postfile , tell($postfile)); - - foreach my $ind (keys %post2) { - print "$ind: $post2{$ind}\n"; - } - print "saved.\n\n"; - } - else { - print "Can't seek $postfile.\n\n"; + print "$ind: $post2{$ind}\n"; } + print "saved.\n\n"; + } + else { + print "Can't seek $postfile.\n\n"; } } - else { - print "Can't lock $postfile.\n\n"; - } - close ($postfile); } - else - { - print "Can't open $postpath.\n\n"; + else { + print "Can't lock $postfile.\n\n"; } + close ($postfile); } - } - - elsif ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { - ++$attnumber; - $post{'img-'.$attnumber}='a_'.$2; - $mode = 'post-img'; - } - elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) { - $mode = 'post-replies'; + else + { + print "Can't open $postpath.\n\n"; } } } - elsif ($mode eq 'post-img') { - if ($tag{'<'} eq 'img') { - my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'}); - if ($imgkey ne '') { - $post{'imgkey-'.$attnumber}=$imgkey; - } - else { - delete $post{'img-'.$attnumber}; - --$attnumber; - } + elsif ($tag{'<'} eq 'a') { + # there is an image attached + if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) { + ++$attnumber; + $post{'img-'.$attnumber}='a_'.$2; + $mode = 'post-img'; } - elsif ($tag{'<'} eq '/a') { - $mode = 'post'; + # the number of replies may be found in one of these links. + elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) { + $mode = 'post-replies'; } } - - - elsif ($mode eq 'post-author') { - if ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { - my $author = $1; - if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { - $author = urldecode($2); - } - if ($post{'author'} eq '') { - $post{'author'} = $author; - $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; - } - } + } + + # an attached image + elsif ($mode eq 'post-img') { + if ($tag{'<'} eq 'img') { + my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'}); + if ($imgkey ne '') { + $post{'imgkey-'.$attnumber}=$imgkey; } - elsif ($tag{'<'} eq '/h3') { - $mode='post'; + else { + delete $post{'img-'.$attnumber}; + --$attnumber; } } - - elsif ($mode eq 'post-content') { - if ($tag{'<'} eq 'div') { # There should not be any sub
s! - ++$level2; - # $post{'content'}.='
'; - $ignoretext=1; - } - elsif ($tag{'<'} eq '/div') { - if($level2){ - --$level2; - # $post{'content'}.='
'; - unless($level2){ - $ignoretext=0; - } + elsif ($tag{'<'} eq '/a') { + $mode = 'post'; + } + } + + elsif ($mode eq 'post-author') { + # name can be found in hyperlinks + if ($tag{'<'} eq 'a') { + # there are two types of facebook user IDs + if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) { + my $author = $1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $author = urldecode($2); } - else { - $mode = 'post'; + if ($post{'author'} eq '') { + $post{'author'} = $author; + $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'}; } } - elsif ($tag{'<'} eq 'br') { - $post{'content'}.='
'; - } - elsif ($tag{'<'} eq 'p') { - $post{'content'}.='

'; + } + elsif ($tag{'<'} eq '/h3') { + $mode='post'; + } + } + + # the post content + elsif ($mode eq 'post-content') { + # There should not be any sub

s. Ignore everything inside. + if ($tag{'<'} eq 'div') { + ++$level2; + $ignoretext=1; + } + elsif ($tag{'<'} eq '/div') { + if($level2){ + --$level2; + unless($level2){ + $ignoretext=0; + } } - elsif ($tag{'<'} eq '/p') { - $post{'content'}.='

'; + else { + $mode = 'post'; } - elsif (!$ignoretext) { - if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) { - my $imgurl = urldecode($2); - my $imgid = 'r_'; - $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; - - while(length($imgurl)>240) { - $imgid.=substr($imgurl,0,120).'-/'; - $imgurl=substr($imgurl,120); - } - $imgid.=$imgurl; + } + elsif ($tag{'<'} eq 'br') { + $post{'content'}.='
'; + } + elsif ($tag{'<'} eq 'p') { + $post{'content'}.='

'; + } + elsif ($tag{'<'} eq '/p') { + $post{'content'}.='

'; + } + elsif (!$ignoretext) { + if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) { + my $imgurl = urldecode($2); + my $imgid = 'r_'; + $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg; - my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'}); - if ($imgkey ne '') { - $post{'content'}.=''.$tag{'alt'}.''; - } - else { - $post{'content'}.=''.$tag{'alt'}.''; - } - - } - elsif ($tag{'<'} eq 'a') { - if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { - $post{'content'}.=''; - $link=1; - } - elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { - my $person = $1; - if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { - $person = urldecode($2); - } - $post{'content'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); - $link=1; - $hidename=1; - } + while(length($imgurl)>240) { + $imgid.=substr($imgurl,0,120).'-/'; + $imgurl=substr($imgurl,120); } - elsif ($tag{'<'} eq '/a') { - if($link) { - $post{'content'}.=''; - $link=0; - $hidename=0; - } + $imgid.=$imgurl; + + my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'}); + if ($imgkey ne '') { + $post{'content'}.=''.$tag{'alt'}.''; } else { - # $post{'content'}.=''; + $post{'content'}.=''.$tag{'alt'}.''; } } - - } - - elsif ($mode eq 'post-time') { - if ($tag{'<'} eq '/abbr') { - $mode = 'post'; - } - } - - elsif ($mode eq 'post-replies') { - if ($tag{'<'} eq '/a') { - $mode = 'post'; + elsif ($tag{'<'} eq 'a') { + # a link to an external page + if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) { + $post{'content'}.=''; + $link=1; + } + # a link to a user + elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) { + my $person = $1; + if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) { + $person = urldecode($2); + } + $post{'content'}.=''.(($$names{$person} ne '')?$$names{$person}:$$names{'default'}); + $link=1; + $hidename=1; + } } - } - - - if ($tag{"\\"} ne '') { - $closetag = 1; - next; - } - - local $/ = '<'; - unless (defined ($text = <$contentfile>)) { - close($contentfile); - return; - } - local $/ = "\n"; - $text =~ s/<$//; - - # # DEBUG - # if ($pagetype eq 'thread') { - # print ">>$mode: $text\n"; - # } - - if($mode eq 'thread-content') { - unless ($ignoretext or $hidename){ - $thread{'postcontent'}.=$text; + elsif ($tag{'<'} eq '/a') { + if($link) { + $post{'content'}.=''; + $link=0; + $hidename=0; + } } - } - elsif ($mode eq 'thread-time') { - $thread{'timetext'}.=$text; - } - elsif ($mode eq 'thread-attachment-link-title') { - $thread{'linktitle-'.$attnumber}.=$text; - } - elsif ($mode eq 'thread-attachment-link') { - $thread{'linktext-'.$attnumber}.=$text; - } - elsif ($mode eq 'thread-replies') { - if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) { - $thread{'replies'} = $1; + else { + # $post{'content'}.=''; } } - if($mode eq 'post-content') { - unless ($ignoretext or $hidename){ - $post{'content'}.=$text; - } - } - elsif ($mode eq 'post-time') { - $post{'timetext'}.=$text; + } + + elsif ($mode eq 'post-time') { + if ($tag{'<'} eq '/abbr') { + $mode = 'post'; } - elsif ($mode eq 'post-replies') { - if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) { - $post{'replies'} = $1; - } + } + + elsif ($mode eq 'post-replies') { + if ($tag{'<'} eq '/a') { + $mode = 'post'; } - } + # dealing with the tag is finished. + # if the tag ends with "/>" chande it into an ")) { + close($contentfile); + return; + } + local $/ = "\n"; + $text =~ s/<$//; + + # # DEBUG + # if ($pagetype eq 'thread') { + # print ">>$mode: $text\n"; + # } + + # depending on state add text to relevant fields. + + if($mode eq 'thread-content') { + unless ($ignoretext or $hidename){ + $thread{'postcontent'}.=$text; + } + } + # the format facebook uses for showing time is not always helpful (for + # example: "2 mins ago") There is no setting to change it in facebook. + # at least in the m.facebook.com. The bot corrently DOES NOT interpret the + # text. + elsif ($mode eq 'thread-time') { + $thread{'timetext'}.=$text; + } + elsif ($mode eq 'thread-attachment-link-title') { + $thread{'linktitle-'.$attnumber}.=$text; + } + elsif ($mode eq 'thread-attachment-link') { + $thread{'linktext-'.$attnumber}.=$text; + } + elsif ($mode eq 'thread-replies') { + if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) { + $thread{'replies'} = $1; + } + } + + if($mode eq 'post-content') { + unless ($ignoretext or $hidename){ + $post{'content'}.=$text; + } + } + elsif ($mode eq 'post-time') { + $post{'timetext'}.=$text; + } + elsif ($mode eq 'post-replies') { + if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) { + $post{'replies'} = $1; + } + } } + close ($contentfile); } # Function to read data from datafiles. @@ -1031,6 +1168,15 @@ sub readdatafile { return %data; } +# the function to write data to datafiles (see readdatafile() description) +# +# First argument can be a path or a file handle. In case of a file handle it +# will just read the file. In case of path it opens the file before writing and +# closes after. +# +# On failure (file not open) returns 0. +# On success returns 1. +# sub writedatafile { (my $headerpath, my %header) = @_; my $headerfile; @@ -1060,6 +1206,9 @@ sub writedatafile { return 1; } +# Function to get a timenumber from a "date" http header field value. +# It's a 14 digit number: 4 - year, 2 - month, 2 - day, 2 - hour, 2 - minute, +# 2 - second. sub gettimenumber { (my $date) = @_; my $year; @@ -1069,7 +1218,8 @@ sub gettimenumber { my $minute; my $second; - # see https://tools.ietf.org/html/rfc2616#section-3.3.1 + # There are 3 possible formats. + # See https://tools.ietf.org/html/rfc2616#section-3.3.1 if ($date =~ /^[A-Za-z]{3}, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/){ $day=$1; $month=lc($2); @@ -1142,6 +1292,13 @@ sub gettimenumber { return $year.$month.$day.$hour.$minute.$second; } +# Function to get information about a html tag. +# The argument is the tag text without the <>! +# It returns a hash with the attributes' values. +# special values: +# '<' - the tag name (may start with '/'), +# '/' - when the tag starts with '/', +# '\' - when the tag ends with '/', sub taginfo { (my $tagtext) = @_; my %tag; @@ -1169,15 +1326,11 @@ sub taginfo { if ($tagtext =~ /\/[ \t\r\n]*$/) { $tag{"\\"}="\\"; } - # if ($tag{'<'} eq 'div') { - # foreach my $ind (keys %tag) { - # print "$ind: $tag{$ind}\n"; - # } - # print "\n"; - # } return %tag; } +# Function to generate a random hexadecimal number (key) with a defined number +# of bits. sub key { (my $bits) = @_; my $p = int($bits / 16); @@ -1194,6 +1347,13 @@ sub key { return $keytext; } +# The function to save an image from proxy archive +# Arguments: +# 1 - url of image +# 2 - image id +# 3 - group id +# Returns 1 on success and 0 on failure. +# sub saveimg { (my $url, my $id, my $groupid) = @_; @@ -1214,6 +1374,7 @@ sub saveimg { print "Image $id\n"; + # prepare path $basepath=ARCH_PATH.$groupid.'/'; unless (-d $basepath){ unless (mkdir $basepath) { @@ -1229,7 +1390,6 @@ sub saveimg { } } - my $idtemp = $id; while((my $ind = index($idtemp,'/'))>=0) { $basepath.=substr($idtemp, 0, $ind+1); @@ -1246,6 +1406,7 @@ sub saveimg { $headpath=$basepath.'@h'; $imgpath=$basepath.'@v'; + # check if image already saved if ( -f $imgpath) { if (open($headfile,"+<",$headpath)) { if (flock ($headfile, 2)) { @@ -1272,6 +1433,9 @@ sub saveimg { $archheadpath = $archbasepath.'@h'; $archimgpath = $archbasepath.'@v'; print "url: $url\n"; + # Read the http header. Only interested in the status. If a redirection then + # follow it and read again. If 200 it's ok to continue processing. Otherwise + # return. for (my $ind=0; $ind) { +while (defined(my $line = <$configfile>)) { $line =~ s/[\r\n]//g; - $line =~ s/#.*$//; - if ($line =~ /^ *([a-zA-Z0-9_]+) *= *(.*)$/){ - $name=$1; - $value=$2; - $value =~ s/ *$//; + $line =~ s/#.*$//; #comment + if ($line =~ /^[ \t]*([a-zA-Z0-9_\-\.]+)[ \t]*=[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/){ + my $name=$1; + my $value=$2; $set{$name}=$value; } } @@ -47,6 +46,8 @@ $def{'PATH'} = "\$ENV{'PATH'} = '".$set{'path'}."';"; $def{'PERL'} = "#!".$set{'perl'}; +$def{'BOT_CRONTAB'} = $set{'bot_crontab'}.' '.$set{'bin_path'}.'bot'.(($set{'bot_args'} ne '')?(' '.$set{'bot_args'}):''); + $def{'CC'} = 'CC='.$set{'gcc'}; $def{'CF'} = 'CF='.$set{'c_flags'}; $def{'PL'} = 'PL='.$set{'perl'}; @@ -61,7 +62,7 @@ $def{'CM'} = 'CM='.$set{'chmod'}; # ###SOME_NAME; # If found - replace. -while ($line = ) { +while (defined($line = )) { $line =~ s/[\r\n]//g; if ($line =~ /###([a-zA-Z0-9_]+);/) { print "$def{$1}\n"; diff --git a/makefile.1.mak b/makefile.1.mak index f905b36..f502c24 100644 --- a/makefile.1.mak +++ b/makefile.1.mak @@ -9,7 +9,7 @@ ###CM; ###OD; -all: moveout copyout remove #config.txt +all: moveout copyout remove config.txt moveout: bot setuid exec @@ -27,5 +27,9 @@ exec: bot remove: copyout moveout setuid exec # $(RM) proxy.c access.c + bot: bot.1.pl configure.pl settings $(PL) configure.pl settings bot + +config.txt: config.1.txt configure.pl settings + $(PL) configure.pl settings config.txt diff --git a/readthis.txt b/readthis.txt new file mode 100644 index 0000000..d58a85d --- /dev/null +++ b/readthis.txt @@ -0,0 +1,51 @@ +This is the facebook bot and the interface +It depends on the proxy and some other software: +-Apache2 (2.2) (only the interface - not written yet) +-Perl +-curl +-gzip (only for compressing old log files) +and for compilation: +-cp +-move +-rm +-chmod + +It might work with other versions of Apache2 It may work with other +www server software. + +Recommended situation is when the software and data directories belong to a +dedicated user account. The same user who also has the proxy. + +data_path, tmp_path, log_path should only be accessible by this user. +bin_path should be publicly accessible - the programs will be called from the +server. Some will have the SETUID bit set. + +To compile/install: + +Log in to the user account that will own the software. +(If not, you will have to change file ownerships later.) +Edit the file 'settings' to have values relevant to your server/computer. +Create the directories defined there and set correct permissions and ownership. +Run 'make.sh'. It will generate the programs and copy them to the correct +location. +It will also generate config.txt. +Open this file and copy its fragments to your Apache2 config and crontab. +#Restart Apache2. (no need for this yet) + +To add a facebook group: + +Create a file in /data_path/groupsettings. The file name is a number - the +group id. In this file define the following settings: +id - the group id +hidenames - defines if the real names of people will be hidden or converted + "0" or "no" means no, anything else means yes. default value if + undefined is yes. Not implemented yet (maybe never will). Now it + will always hide names. +create another file, add "-names" to the group id for the file name. +In this file should be name mappings: +facebook_id = filtered_name +There MUST be a default name +default = filtered_name. + +The bot accepts group IDs as commandline arguments. If there are none the bot +will process all groups that have a config file. \ No newline at end of file diff --git a/settings b/settings index 0b058ac..2cff094 100644 --- a/settings +++ b/settings @@ -1,18 +1,23 @@ #all directory paths must end with '/' and must already exist. -bin_path = /yplom/bin/facebug1/ -data_path = /yplom/data/facebug1/ -log_path = /yplom/log/facebug1/ -tmp_path = /yplom/tmp/facebug1/ -#www_path = /yplom/www/facebug1/ +bin_path = /yplom/bin/facebug1/ #Where the software will be located +data_path = /yplom/data/facebug1/ #where the software will remember data; subdir: + #group, groupsettings +log_path = /yplom/log/facebug1/ #where the software will remember data +tmp_path = /yplom/tmp/facebug1/ #for temporary fies +www_path = /yplom/www/facebug1/ #for the www server unused -proxy_bin_path = /yplom/bin/proxy/ -proxy_data_path = /yplom/data/proxy/ -proxy_lib_path = /yplom/lib/proxy/ +proxy_bin_path = /yplom/bin/proxy/ #where the proxy software is located +proxy_data_path = /yplom/data/proxy/ #where the proxy remembers data +proxy_lib_path = /yplom/lib/proxy/ #where the proxy library is located -path = /usr/local/bin:/usr/bin:/bin +path = /usr/local/bin:/usr/bin:/bin #The path environment variable. Must be + #overwritten if SETUID. Otherwise + #launching programs may fail. (Perl + #security...) + +#paths to software perl = /usr/bin/perl -curl = /usr/bin/curl chmod = /bin/chmod cp = /bin/cp mv = /bin/mv @@ -31,4 +36,9 @@ logs_total = 10 # How many old logs to keep rm_access_crontab = 0 1 * * * # How often to remove leftover unlock info. cleararch_crontab = 30 0 * * * # How often to clear the archive from old files. oldlogs_crontab = 0 0 * * * # How often to deal with old logs +bot_crontab = 20 2 * * * # How often to run the bot # Not a good idea to launch oldlogs just after cleararch, I think. + +bot_args = 207426296087284 # facebook groups which will be processed by the bot + # numbers separated by spaces. Can be left empty. + # The bot will process all groups then.