]> bicyclesonthemoon.info Git - yplom/facebug1/commitdiff
Explanation
authorb <b@7dec801f-c475-4e67-ba99-809552d69c55>
Thu, 17 Dec 2015 19:05:36 +0000 (19:05 +0000)
committerb <b@7dec801f-c475-4e67-ba99-809552d69c55>
Thu, 17 Dec 2015 19:05:36 +0000 (19:05 +0000)
git-svn-id: svn://botcastle1b/yplom/facebug1@9 7dec801f-c475-4e67-ba99-809552d69c55

bot.1.pl
config.1.txt [new file with mode: 0644]
configure.pl
makefile.1.mak
readthis.txt [new file with mode: 0644]
settings

index de81ef39a27764ef7aca33b48a25f0e656d0f3e5..1ddcf58ea87310a19adf3e212a2873dbc1927448 100644 (file)
--- a/bot.1.pl
+++ b/bot.1.pl
@@ -1,5 +1,11 @@
 ###PERL;
 
+# bot is generated from bot.1.pl
+#
+# This is the facebook bot. It depends on the proxy.
+# It reads pages from m.facebook.com archived on the proxy, extracts threads,
+# posts, images, etc. from a facebook group and saves them.
+
 use strict;
 use Fcntl;
 use File::Copy;
@@ -16,11 +22,16 @@ my $time = time();
 srand ($time-$$);
 
 print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n";
+# If there are commandline arguments the bot will only process the facebook
+# group(s) defined by them. The arguments are numbers which are the group's ID
+# and also the name of the config file.
 if (scalar @ARGV) {
        for (my $ind=0; $ind < scalar @ARGV; ++$ind) {
                processgroup($ARGV[$ind]);
        }
 }
+# If there were no commandline arguments the bot will process all groups which
+# have a config file defined.
 else {
        my $dir;
        my $subpath;
@@ -32,6 +43,8 @@ else {
        }
 }
 
+# The function to process one facebook group. The argument is a number which is
+# the group's ID and also the config file name.
 sub processgroup {
        (my $filenumber) = @_;
        my $settingspath;
@@ -41,6 +54,7 @@ sub processgroup {
        my %names;
        my $archpath;
        
+       # ID must be a number
        if ($filenumber =~ /^([0-9]+)$/) {
                $settingspath = GROUPSETTINGS_PATH.$1;
        }
@@ -48,9 +62,15 @@ sub processgroup {
                return;
        }
        
+       # read the configuration file
        %settings = readconfigfile($settingspath);
+       # The settings defined so far:
+       # id - the group's ID. SHOULD be the same as the file name.
+       # hidenames - defines if the real names of people will be hidden or converted
+       #             "0" or "no" means no, anything else means yes. default value if
+       #             undefined is yes. Not implemented yet (maybe never will). Now it
+       #             will always hide names.
        
-       #The group id SHOULD be the filename. But what if it isn't?
        if ($settings{'id'} =~ /^([0-9]+)$/) {
                $groupid=$1;
        }
@@ -58,6 +78,7 @@ sub processgroup {
                return;
        }
        
+       #read the list of names. A default name MUST be defined.
        $namespath = GROUPSETTINGS_PATH.$groupid.'-names';
        %names = readconfigfile($namespath);
        if ($names{'default'} eq '') {
@@ -65,17 +86,21 @@ sub processgroup {
        }
        
        print "Group $groupid\n";
-       
+       # process the pages with threads:
        $archpath = url2path('https://m.facebook.com/groups/'.$groupid);
        processdir($archpath.'@q/',\%settings,\%names,0);
        processdir($archpath.'/@q/',\%settings,\%names,0);
-       
+       # process the pages with posts:
        $archpath = url2path('https://m.facebook.com/comment/replies');
        processdir($archpath.'@q/',\%settings,\%names,1);
        processdir($archpath.'/@q/',\%settings,\%names,1);
        
 }
 
+# The function to process all files in a directory and (recursively) all
+# subdirectories.
+# The first argument is the directory path (ending with "/").
+# The three other arguments are passed to processfile().
 sub processdir {
        (my $dirpath, my $settings, my $names, my $pagemode) = @_;
        my $dir;
@@ -88,6 +113,7 @@ sub processdir {
        }
        while (defined($subpath = readdir $dir)) {
                $subpathfull=$dirpath.$subpath;
+               # "." or ".." should be ignored 
                if ($subpath =~ /^\.\.?$/) {
                        next;
                }
@@ -98,9 +124,21 @@ sub processdir {
                        processdir($subpathfull.'/', $settings, $names, $pagemode);
                }
        }
-       closedir ($dir);        
+       closedir ($dir);
 }
 
+# The function to process one archived page.
+# Arguments:
+# 1. The path to the archived header. Ends with "@h". Otherwise function just
+#    returns without doing anything.
+# 2. reference to the hash with group settings
+# 3. reference to the hash with names
+# 4. mode, determines how pages are interpreted. If 0 page type can be
+#    'thread' - one thread
+#    'group' - list of threads' firstposts
+#     If nonzero page type can be
+#     'post' - one post (with replies)
+#
 sub processfile {
        (my $headerpath, my $settings, my $names, my $pagemode) = @_;
        my $basepath;
@@ -117,19 +155,20 @@ sub processfile {
        
        my %cgi;
        
-       my $postid;
-       my $threadid;
-       my $groupid=0; ###!
-       my $timenumber;
+       my $postid;     #id of post
+       my $threadid;   #id of thread
+       my $groupid=0;  #determines if group id was found on page. REAL group id is in $$settings{'id'}!
+       my $timenumber; #number, determines when page was saved on the proxy.
        
-       my %thread;
-       my %thread2;
+       my %thread;  #thread info, created by interpreting the page
+       my %thread2; #thread info, created by reading previously archived file
        
-       my %post;
-       my %post2;
+       my %post;    #post info, created by interpreting the page
+       my %post2;   #post info, created by reading previously archived file
        
-       my $pagetype;
+       my $pagetype; #type of page: 'group', 'thread' or 'post'
        
+       # Argument must be HEADER path.
        if ($headerpath =~ /^((.+)\@h)$/) {
                $headerpath = $1;
                $basepath = $2;
@@ -142,7 +181,8 @@ sub processfile {
        ($prot, $host, $port, $path, $query) = path2urldiv($basepath);
        
        print 'Page '.joinurl($prot, $host, $port, $path, $query)."\n";
-       ### REDESIGN THE CONDITIONS!
+       
+       #Determine what type of page it is. If none ot the three types - return.
        if($query ne '') {
                %cgi=getcgi($query);
                if($pagemode) {
@@ -176,14 +216,14 @@ sub processfile {
                        $pagetype = 'group';
                }
        }
-       print " type=$pagetype\n";
+       print "type=$pagetype\n";
        
+       # Read the http header. Only interested in the status. If a redirection then
+       # follow it and read again. If 200 it's ok to continue processing. Otherwise
+       # return.
        for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
                %header = readheaderfile($headerpath); 
-               if ($header{'status'} =~ /^200 /) {
-                       last;
-               }
-               elsif ($header{'status'} =~ /^30[1237] /) {
+               if ($header{'status'} =~ /^30[1237] /) {
                        my $location;
                        unless (defined($location = $header{'location'})) {
                                return;
@@ -197,10 +237,14 @@ sub processfile {
                        $contentpath = $basepath.'@v';
                }
                else {
-                       return;
+                       last;
                }
        }
+       if ($header{'status'} !~ /^200 /) {
+               return;
+       }
        
+       # find out when the page was archived on the proxy.
        unless ($timenumber = gettimenumber ($header{'date'})) {
                my @stat;
                unless (@stat = stat $contentpath) {
@@ -211,371 +255,349 @@ sub processfile {
                }
        }
        
-       # This condition is redundant now.
-       if ($pagetype) {
+       # Now the bot can start reading the file
+       
+       unless (open ($contentfile, "<",$contentpath)) {
+               print "Can't open $contentpath.\n";
+               return;
+       }
+       
+       my $text;      # piece of text read from the file
+       my %tag;       # html tag
+       my $mode;      # the state of the main state machine
+       my $level;     # keeps track in how many <div> levels the bot went
+       my $level2;    # same as above used in different state (previous must be kept)
+       my $closetag=0;# if there is a tag to close
+       my $ignoretext;# if text should be ignored and not added to post/thread content
+       my $link;      # if bot is inside a link
+       my $hidename;  # if bot is inside a part which contains a name to be hidden
+       my $attnumber; # number of current attachment
+       my $incomplete;# if thread firstpost's content is incomplete 
+       my $firstpost; # if the bot is in the firstpost (important if pagetype='post')
+       
+       # Set initial values depending on page type.
+       if ($pagetype eq 'thread') {
+               print "Thread $threadid\n";
                
+               $thread{'id'}=$threadid;
+               $thread{'groupid'}=$$settings{'id'};
+               $thread{'timenumber'}=$timenumber;
+               $mode = 'thread';
+               $level=0;
+               $attnumber=0;
+               $incomplete=0;
+       }
+       elsif ($pagetype eq 'post'){
+               print "Post $postid ($threadid)\n"; 
                
-               unless (open ($contentfile, "<",$contentpath)) {
-                       print "Can't open $contentpath.\n";
-                       return;
+               $mode='posts';
+               $firstpost=1;
+       }
+       else { #group
+               print "Threads\n";
+               $mode = 'threads';
+       }
+       
+       my $line;
+       
+       # The main loop. It reads the file like this:
+       # <tag>text<tag>text<tag>text...
+       # If there two tags next to each other <tag1><tag2> there still is text
+       # between them, only with zero length.
+       # It the tag looks like this:
+       # <tag name1="value1" />
+       # the bot will interpret it as
+       # <tag name1="value1"></tag>
+       # without the 0-length text between.
+       #
+       # This is a state machine
+       # In the main loop the tag is read and depending on the tag's content, state
+       # and some variable values some actions can be taken.
+       # Then the text is read and again depending ot the text, state and some
+       # variable values some actions can be taken.
+       # This loop continues until the end of file.
+       
+       #read and ignore text before the first tag.
+       local $/ = '<';
+       unless (defined ($text = <$contentfile>)) {
+               close($contentfile);
+               return;
+       }
+       # main loop
+       while ($mode ne '') {
+               # if there was a tag ending with "/>" in the previous iteration then in this
+               # one it will be treated as the same tag but with "</".
+               if ($closetag){
+                       $tag{'<'} = '/'.$tag{'<'};
+                       $tag{'/'}='/';
+                       delete $tag{"\\"};
+                       $closetag=0;
                }
                
-               my $text;
-               my %tag;
-               my $mode;
-               my $level;
-               my $level2;
-               my $closetag=0;
-               my $ignoretext;
-               my $link;
-               my $hidename;
-               my $attnumber;
-               my $incomplete;
-               my $firstpost;
-               
-               if ($pagetype eq 'thread') {
-                       print "Thread $threadid\n";
-                       
-                       $thread{'id'}=$threadid;
-                       $thread{'groupid'}=$$settings{'id'};
-                       $thread{'timenumber'}=$timenumber;
-                       $mode = 'thread';
-                       $level=0;
-                       $attnumber=0;
-                       $incomplete=0;
-               }
-               elsif ($pagetype eq 'post'){
-                       print "Post $postid ($threadid)\n"; 
+               # otherwise read next tag
+               else {
+                       local $/ = '>';
+                       unless (defined ($text = <$contentfile>)) {
+                               close($contentfile);
+                               return;
+                       }
+                       $text =~ s/>$//;
+                       # # DEBUG:
+                       # if($pagetype eq 'thread'){
+                               # print ">>$mode: <$text>\n";
+                       # }
                        
-                       $mode='posts';
-                       $firstpost=1;
-               }
-               else { #group
-                       print "Threads\n";
-                       $mode = 'threads';
+                       # get the attributes of the tag
+                       # special values: '<' - tag name (with or without '/'), '/' - when it's a
+                       # closing tag, '\' when tag ends with "/>".
+                       %tag = taginfo($text);
                }
+               local $/ = "\n";
                
-               my $line;
-               
-               local $/ = '<';
-               unless (defined ($text = <$contentfile>)) {
-                       close($contentfile);
-                       return;
+               # List of threads. Look for <div>s with threads
+               if ($mode eq 'threads'){
+                       # Thread found. Id not known yet!
+                       if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) {
+                               print "Thread [$1]\n";
+                               $mode = 'thread';
+                               # set initial values
+                               %thread = ();
+                               $thread{'groupid'}=$$settings{'id'};
+                               $thread{'timenumber2'}=$timenumber;
+                               $level = 0;
+                               $attnumber=0;
+                               $incomplete=0;
+                       }
                }
-               while ($mode ne '') {
-                       if ($closetag){
-                               $tag{'<'} = '/'.$tag{'<'};
-                               $tag{'/'}='/';
-                               delete $tag{"\\"};
-                               $closetag=0;
+               
+               # one thread
+               elsif ($mode eq 'thread'){
+                       # thread author is in first <h3>
+                       if ($tag{'<'} eq 'h3') {
+                               $mode = 'thread-author';
                        }
-                       else {
-                               local $/ = '>';
-                               unless (defined ($text = <$contentfile>)) {
-                                       close($contentfile);
-                                       return;
+                       elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
+                               # Post content is (always?) in the first <div> with a 2 letter class name after author
+                               if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) {
+                                       $mode='thread-content';
+                                       $level2=0;
+                                       $ignoretext=1; # text in firstposts only inside <p>
+                                       $hidename=0;
+                                       $link=0;
+                               }
+                               else {
+                                       ++$level;
                                }
-                               $text =~ s/>$//;
-                               # # DEBUG:
-                               # if($pagetype eq 'thread'){
-                                       # print ">>$mode: <$text>\n";
-                               # }
-                               %tag = taginfo($text);
                        }
-                       local $/ = "\n";
+                       elsif (($tag{'<'} eq '/div') and $level) {
+                               --$level;
+                       }
                        
-                       if ($mode eq 'threads'){
-                               if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) {
-                                       print "Thread [$1]\n";
-                                       $mode = 'thread';
-                                       %thread = ();
-                                       $thread{'groupid'}=$$settings{'id'};
-                                       $thread{'timenumber2'}=$timenumber;
-                                       $level = 0;
-                                       $attnumber=0;
-                                       $incomplete=0;
-                               }
+                       # the time text is in the only <abbr> in thread firstpost
+                       elsif ($tag{'<'} eq 'abbr') {
+                               $mode = 'thread-time';
                        }
                        
-                       elsif ($mode eq 'thread'){
-                               # print "+++$text+++\n";
-                               if ($tag{'<'} eq 'h3') {
-                                       $mode = 'thread-author';
+                       elsif ($tag{'<'} eq 'a') {
+                               # there is an image attached
+                               if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+                                       ++$attnumber;
+                                       $thread{'img-'.$attnumber}='a_'.$2;
+                                       $mode = 'thread-attachment-img';
                                }
-                               elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
-                                       # These are very 'helpful' class names, facebug, thank you!
-                                       # After recent changes do I still need the unreliable class name?
-                                       # Is post content always in first <div> after author with a 2 letter class name?
-                                       # Let's test:
-                                       if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) {
-                                       # if (($tag{'class'} =~ /^(bj|bk|bm)|(db|da)$/) and (!defined($thread{'postcontent'}))) {
-                                               $mode='thread-content';
-                                               $level2=0;
-                                               $ignoretext=1;
-                                               $hidename=0;
-                                               $link=0;
-                                       }
-                                       # elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)|(dc|db)$/)) { ### NAMES NOT RELIABLE! HAVE TO IMPROVE SERIOUSLY!
-                                               # $mode='thread-attachment';
-                                               # $level2=0;
-                                               # $attnumber=0;
-                                       # }
-                                       else {
-                                               ++$level;
+                               # there is a link attached
+                               elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+                                       ++$attnumber;
+                                       $thread{'link-'.$attnumber}=urldecode($3);
+                                       $mode = 'thread-attachment-link';
+                               }
+                               # if thread id is not known it can be determined from this link.
+                               # also, the number of replies may be found in one of these links.
+                               elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                       if ($thread{'id'} eq '') {
+                                               $thread{'id'} = $2;
+                                               print "Thread $thread{'id'}\n";
                                        }
+                                       $mode = 'thread-replies';
                                }
-                               elsif (($tag{'<'} eq '/div') and $level) {
-                                       --$level;
+                       }
+                       
+                       # Depending on page mode the thread and firstpost information can end in
+                       # two different ways. When page type is 'thread' the end is at the <div>
+                       # whose id starts with "ufi_". When page type is 'group' it ends when
+                       # leaving the thread-related div
+                       # 
+                       elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) {
+                               # depending on page type the rest of the page contains posts or other
+                               # threads.
+                               if ($pagetype eq 'thread') {
+                                       $mode='posts';
                                }
-                               
-                               elsif ($tag{'<'} eq 'abbr') {
-                                       $mode = 'thread-time';
+                               else {
+                                       $mode='threads';
                                }
                                
-                               elsif ($tag{'<'} eq 'a') {
-                                       if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
-                                               ++$attnumber;
-                                               $thread{'img-'.$attnumber}='a_'.$2;
-                                               $mode = 'thread-attachment-img';
-                                       }
-                                       elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
-                                               ++$attnumber;
-                                               $thread{'link-'.$attnumber}=urldecode($3);
-                                               $mode = 'thread-attachment-link';
+                               # Now prepare to save the file 
+                               my $threadfile;
+                               my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
+                               unless (-d $threadpath) {
+                                       unless (mkdir $threadpath) {
+                                               print "Can't mkdir $threadpath.\n";
                                        }
-                                       elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) {
-                                               if ($thread{'id'} eq '') {
-                                                       $thread{'id'} = $2;
-                                                       print "Thread $thread{'id'}\n";
-                                               }
-                                               $mode = 'thread-replies';
+                               }
+                               $threadpath.='thread/';
+                               unless (-d $threadpath) {
+                                       unless (mkdir $threadpath) {
+                                               print "Can't mkdir $threadpath.\n";
                                        }
                                }
+                               $threadpath.=$thread{'id'};
                                
-                               elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) {
-                                       if ($pagetype eq 'thread') {
-                                               $mode='posts';
-                                       }
-                                       else {
-                                               $mode='threads';
-                                       }
-                                       
-                                       my $threadfile;
-                                       my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
-                                       unless (-d $threadpath) {
-                                               unless (mkdir $threadpath) {
-                                                       print "Can't mkdir $threadpath.\n";
-                                               }
-                                       }
-                                       $threadpath.='thread/';
-                                       unless (-d $threadpath) {
-                                               unless (mkdir $threadpath) {
-                                                       print "Can't mkdir $threadpath.\n";
+                               if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) {
+                                       if (flock ($threadfile, 2)) {
+                                               # read the data already saved in file
+                                               %thread2 = readdatafile($threadfile);
+                                               
+                                               # in 'threads' page type the firstpost's content may be incomplete.
+                                               # in that case it should not be written to file if there is already
+                                               # one. Even if the file has an older version.
+                                               # but in this page type there is information about the number of
+                                               # replies, not found in 'thread' page type. This information should
+                                               # be written even when the post contnet shouldn't.
+                                               #
+                                               # That's why a thread has the timenumber and timenumber2.
+                                               # timenumber defines the time of the post content. in pagetype
+                                               # 'thread' only timenumber is checked and only timenumber is
+                                               # updated.
+                                               # timenumber2 defines the time of information only available in the
+                                               # 'group' page type. In this pagetype the post content is only
+                                               # updated if complete and timenumber allows it. Other information is
+                                               # updated if timenumber2 allows it 
+                                               
+                                               # Don't overwrite newer information with older.
+                                               if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) {
+                                                       print ("Newer version already saved.\n\n");
                                                }
-                                       }
-                                       $threadpath.=$thread{'id'};
-                                       
-                                       # if (open ($threadfile, "+<", $threadpath)) {
-                                       if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) {
-                                               if (flock ($threadfile, 2)) {
-                                                       %thread2 = readdatafile($threadfile);
-                                                       
-                                                       if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) {
-                                                               print ("Newer version already saved.\n\n");
-                                                       }
-                                                       else {
-                                                               if($pagetype ne 'thread'){
-                                                                       if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) {
-                                                                               print ("Newer version of post content already saved.\n");
-                                                                               delete $thread{'postcontent'};
-                                                                       }
-                                                                       elsif($incomplete) {
-                                                                               print ("Post content incomplete.\n");
-                                                                               if(defined($thread2{'postcontent'})){
-                                                                                       delete $thread{'postcontent'};
-                                                                               }
-                                                                       }
-                                                                       else {
-                                                                               $thread{'timenumber'}=$thread{'timenumber2'};
-                                                                       }
+                                               else {
+                                                       if($pagetype ne 'thread'){
+                                                               # Don't overwrite newer post content with older.
+                                                               if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) {
+                                                                       print ("Newer version of post content already saved.\n");
+                                                                       delete $thread{'postcontent'};
                                                                }
-                                                               foreach my $ind (keys %thread2) {
-                                                                       if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
-                                                                               delete $thread2{$ind};
+                                                               elsif($incomplete) {
+                                                                       # Don't overwrite complete post content with incomplete one.
+                                                                       # Write incomplete content if nothing was archived before,
+                                                                       # better this than nothing.
+                                                                       print ("Post content incomplete.\n");
+                                                                       if(defined($thread2{'postcontent'})){
+                                                                               delete $thread{'postcontent'};
                                                                        }
                                                                }
-                                                               foreach my $ind (keys %thread) {
-                                                                       $thread2{$ind}=$thread{$ind};
+                                                               else {
+                                                                       $thread{'timenumber'}=$thread{'timenumber2'};
                                                                }
-                                                               if ($thread2{'key'} eq '') {
-                                                                       $thread2{'key'} = key(KEY_BITS);
+                                                       }
+                                                       # delete previous information about attachments - the numbers
+                                                       # could have changed.
+                                                       foreach my $ind (keys %thread2) {
+                                                               if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
+                                                                       delete $thread2{$ind};
                                                                }
+                                                       }
+                                                       
+                                                       # overwrite previous information with new one
+                                                       foreach my $ind (keys %thread) {
+                                                               $thread2{$ind}=$thread{$ind};
+                                                       }
+                                                       if ($thread2{'key'} eq '') {
+                                                               $thread2{'key'} = key(KEY_BITS);
+                                                       }
+                                                       
+                                                       # write data to file
+                                                       if (seek($threadfile, 0, 0)) {
+                                                               writedatafile($threadfile,%thread2);
+                                                               truncate ($threadfile , tell($threadfile));
                                                                
-                                                               if (seek($threadfile, 0, 0)) {
-                                                                       writedatafile($threadfile,%thread2);
-                                                                       truncate ($threadfile , tell($threadfile));
-                                                                       
-                                                                       foreach my $ind (keys %thread2) {
-                                                                               print "$ind: $thread2{$ind}\n"; ####
-                                                                       }
-                                                                       print "saved.\n\n";
-                                                               }
-                                                               else {
-                                                                       print "Can't seek $threadfile.\n\n";
+                                                               foreach my $ind (keys %thread2) {
+                                                                       print "$ind: $thread2{$ind}\n"; ####
                                                                }
+                                                               print "saved.\n\n";
+                                                       }
+                                                       else {
+                                                               print "Can't seek $threadfile.\n\n";
                                                        }
-                                               }
-                                               else {
-                                                       print "Can't lock $threadfile.\n\n";
-                                               }
-                                               close ($threadfile);
-                                       }
-                                       else
-                                       {
-                                               print "Can't open $threadpath.\n\n";
-                                       }
-                               }
-                       }
-                       
-                       elsif ($mode eq 'thread-author') {
-                               if ($tag{'<'} eq 'a') {
-                                       if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
-                                               my $author = $1;
-                                               if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
-                                                       $author = urldecode($2);
-                                               }
-                                               if ($thread{'author'} eq '') {
-                                                       $thread{'author'} = $author;
-                                                       $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
-                                               }
-                                       }
-                               }
-                               elsif ($tag{'<'} eq '/h3') {
-                                       $mode='thread';
-                               }
-                       }
-                       
-                       elsif ($mode eq 'thread-content') {
-                               if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
-                                       ++$level2;
-                                       # $thread{'postcontent'}.='<div>';
-                                       $ignoretext=1;
-                               }
-                               elsif ($tag{'<'} eq '/div') {
-                                       if($level2){
-                                               --$level2;
-                                               # $thread{'postcontent'}.='</div>';
-                                               unless($level2) {
-                                                       $ignoretext=0;
                                                }
                                        }
                                        else {
-                                               $mode = 'thread';
+                                               print "Can't lock $threadfile.\n\n";
                                        }
+                                       close ($threadfile);
                                }
-                               elsif ($tag{'<'} eq 'br') {
-                                       $thread{'postcontent'}.='<br>';
-                                       $ignoretext=0;
+                               else
+                               {
+                                       print "Can't open $threadpath.\n\n";
                                }
-                               elsif ($tag{'<'} eq 'p') {
-                                       $thread{'postcontent'}.='<p>';
-                                       $ignoretext=0;
-                               }
-                               elsif ($tag{'<'} eq '/p') {
-                                       $thread{'postcontent'}.='</p>';
-                                       $ignoretext=1;
-                               }
-                               elsif (!$ignoretext) {
-                                       if ($tag{'<'} eq 'a') {
-                                               if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
-                                                       $thread{'postcontent'}.='<a href="'.urldecode($3).'">';
-                                                       $link=1;
-                                               }
-                                               elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
-                                                       my $person=$1;
-                                                       if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
-                                                               $person = urldecode($2);
-                                                       }
-                                                       $thread{'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
-                                                       $link=1;
-                                                       $hidename=1;
-                                               }
-                                       }
-                                       elsif ($tag{'<'} eq '/a') {
-                                               if($link) {
-                                                       $thread{'postcontent'}.='</a>';
-                                                       $link=0;
-                                                       $hidename=0;
-                                               }
-                                       }
-                                       else {
-                                       # $thread{'postcontent'}.='<!'.$tag{'<'}.'!>';
+                       }
+               }
+               
+               # author name
+               elsif ($mode eq 'thread-author') {
+                       # name can be found in hyperlinks
+                       if ($tag{'<'} eq 'a') {
+                               # there are two types of facebook user IDs
+                               if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+                                       my $author = $1;
+                                       if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                               $author = urldecode($2);
                                        }
-                               }
-                               elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) {
-                                       unless($incomplete) {
-                                               $thread{'postcontent'}.='<p><b>Post not completely archived.</b></p>';
+                                       if ($thread{'author'} eq '') {
+                                               $thread{'author'} = $author;
+                                               $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
                                        }
-                                       $incomplete=1;
                                }
                        }
-                       
-                       elsif ($mode eq 'thread-time') {
-                               if ($tag{'<'} eq '/abbr') {
-                                       $mode = 'thread';
-                               }
+                       # go out of <h3>
+                       elsif ($tag{'<'} eq '/h3') {
+                               $mode='thread';
                        }
-                       
-                       
-                       # elsif ($mode eq 'thread-attachment') {
-                               # if ($tag{'<'} eq 'div') {
-                                       # ++$level2;
-                               # }
-                               # elsif ($tag{'<'} eq '/div') {
-                                       # if($level2){
-                                               # --$level2;
-                                       # }
-                                       # else {
-                                               # $mode = 'thread';
-                                       # }
-                               # }
-                               # elsif ($tag{'<'} eq 'a') {
-                                       # if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
-                                               # ++$attnumber;
-                                               # $thread{'img-'.$attnumber}='a_'.$2;
-                                               # $mode = 'thread-attachment-img';
-                                       # }
-                                       # elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
-                                               # ++$attnumber;
-                                               # $thread{'link-'.$attnumber}=urldecode($3);
-                                               # $mode = 'thread-attachment-link';
-                                       # }
-                               # }
-                       # }
-                       
-                       elsif ($mode eq 'thread-attachment-img') {
-                               if ($tag{'<'} eq 'img') {
-                                       my $imgkey = saveimg($tag{'src'},$thread{'img-'.$attnumber},$$settings{'id'});
-                                       if ($imgkey ne '') {
-                                               $thread{'imgkey-'.$attnumber}=$imgkey;
-                                       }
-                                       else {
-                                               delete $thread{'img-'.$attnumber};
-                                               --$attnumber;
+               }
+               
+               # the firstpost's content
+               elsif ($mode eq 'thread-content') {
+                       # There should not be any sub<div>s. Ignore everything inside.
+                       if ($tag{'<'} eq 'div') {
+                               ++$level2;
+                               $ignoretext=1;
+                       }
+                       elsif ($tag{'<'} eq '/div') {
+                               if($level2){
+                                       --$level2;
+                                       unless($level2) {
+                                               $ignoretext=0;
                                        }
                                }
-                               elsif ($tag{'<'} eq '/a') {
+                               else {
                                        $mode = 'thread';
                                }
                        }
-                       
-                       elsif ($mode eq 'thread-attachment-link') {
-                               if($tag{'<'} eq 'h3'){
-                                       $mode = 'thread-attachment-link-title';
-                               }
-                               elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) {
-                                       my $imgurl = urldecode($3);
-                                       my $imgid='i_';
+                       elsif ($tag{'<'} eq 'br') {
+                               $thread{'postcontent'}.='<br>';
+                               $ignoretext=0;
+                       }
+                       elsif ($tag{'<'} eq 'p') {
+                               $thread{'postcontent'}.='<p>';
+                               $ignoretext=0;
+                       }
+                       elsif ($tag{'<'} eq '/p') {
+                               $thread{'postcontent'}.='</p>';
+                               $ignoretext=1;
+                       }
+                       elsif (!$ignoretext) {
+                               # inline image (smiley?)
+                               if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
+                                       my $imgurl = urldecode($2);
+                                       my $imgid = 'r_';
                                        $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
                                        
                                        while(length($imgurl)>240) {
@@ -583,373 +605,488 @@ sub processfile {
                                                $imgurl=substr($imgurl,120);
                                        }
                                        $imgid.=$imgurl;
+                               
                                        my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
                                        if ($imgkey ne '') {
-                                               $thread{'img-'.$attnumber}=$imgid;
-                                               $thread{'imgkey-'.$attnumber}=$imgkey;
+                                               $thread{'postcontent'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
+                                       }
+                                       else {
+                                               $thread{'postcontent'}.='<img src="" alt="'.$tag{'alt'}.'">';
+                                       }
+                               }
+                               elsif ($tag{'<'} eq 'a') {
+                                       # a link to an external page
+                                       if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+                                               $thread{'postcontent'}.='<a href="'.urldecode($3).'">';
+                                               $link=1;
+                                       }
+                                       # a link to a user
+                                       elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
+                                               my $person=$1;
+                                               if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                                       $person = urldecode($2);
+                                               }
+                                               $thread{'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+                                               $link=1;
+                                               $hidename=1;
                                        }
                                }
                                elsif ($tag{'<'} eq '/a') {
-                                       $mode = 'thread';
+                                       if($link) {
+                                               $thread{'postcontent'}.='</a>';
+                                               $link=0;
+                                               $hidename=0;
+                                       }
+                               }
+                               else {
+                               # $thread{'postcontent'}.='<!'.$tag{'<'}.'!>';
                                }
                        }
-                       
-                       elsif ($mode eq 'thread-attachment-link-title') {
-                               if ($tag{'<'} eq '/h3') {
-                                       $mode = 'thread-attachment-link';
+                       # a link for "more..." outside the <p>s = past incomplete!
+                       elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) {
+                               unless($incomplete) {
+                                       $thread{'postcontent'}.='<p><b>Post not completely archived.</b></p>';
                                }
+                               $incomplete=1;
                        }
-                       
-                       elsif ($mode eq 'thread-replies') {
-                               if ($tag{'<'} eq '/a') {
-                                       $mode = 'thread';
+               }
+               
+               # time
+               elsif ($mode eq 'thread-time') {
+                       if ($tag{'<'} eq '/abbr') {
+                               $mode = 'thread';
+                       }
+               }
+               
+               # an attached image
+               elsif ($mode eq 'thread-attachment-img') {
+                       if ($tag{'<'} eq 'img') {
+                               my $imgkey = saveimg($tag{'src'},$thread{'img-'.$attnumber},$$settings{'id'});
+                               if ($imgkey ne '') {
+                                       $thread{'imgkey-'.$attnumber}=$imgkey;
+                               }
+                               else {
+                                       delete $thread{'img-'.$attnumber};
+                                       --$attnumber;
                                }
                        }
-                       
-                       
-                       elsif ($mode eq 'posts') {
-                               if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
-                                       %post = ();
-                                       
-                                       $post{'id'} = $1;
-                                       $post{'threadid'} = $threadid;
-                                       $post{'groupid'} = $$settings{'id'};
-                                       $post{'timenumber'} = $timenumber;
-                                       
-                                       $mode = 'post';
-                                       $level=0;
-                                       $attnumber=0;
-                                       
-                                       if($pagetype eq 'post') {
-                                               if(!$groupid) {
-                                                       print "Can't determine if post belongs to group $$settings{'id'}.\n";
-                                                       $mode='';
-                                                       last;
-                                               }
-                                               elsif($post{'id'} eq $postid) {
-                                                       $firstpost=1;
-                                               }
-                                               else {
-                                                       $firstpost=0;
-                                                       $post{'postid'} = $postid;
-                                               }
-                                       }
-                                       print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n";
+                       elsif ($tag{'<'} eq '/a') {
+                               $mode = 'thread';
+                       }
+               }
+               
+               #an attached link
+               elsif ($mode eq 'thread-attachment-link') {
+                       #"title" is found in <h3> 
+                       if($tag{'<'} eq 'h3'){
+                               $mode = 'thread-attachment-link-title';
+                       }
+                       # an image included to illustrate the attached link. Facebook automatically chooses it...
+                       elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) {
+                               my $imgurl = urldecode($3);
+                               my $imgid='i_';
+                               $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
+                               
+                               while(length($imgurl)>240) {
+                                       $imgid.=substr($imgurl,0,120).'-/';
+                                       $imgurl=substr($imgurl,120);
+                               }
+                               $imgid.=$imgurl;
+                               my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
+                               if ($imgkey ne '') {
+                                       $thread{'img-'.$attnumber}=$imgid;
+                                       $thread{'imgkey-'.$attnumber}=$imgkey;
                                }
-                               elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) {
-                                       if ($1 eq $$settings{'id'}) {
-                                               $groupid = 1;
+                       }
+                       elsif ($tag{'<'} eq '/a') {
+                               $mode = 'thread';
+                       }
+               }
+               
+               elsif ($mode eq 'thread-attachment-link-title') {
+                       if ($tag{'<'} eq '/h3') {
+                               $mode = 'thread-attachment-link';
+                       }
+               }
+               
+               elsif ($mode eq 'thread-replies') {
+                       if ($tag{'<'} eq '/a') {
+                               $mode = 'thread';
+                       }
+               }
+               
+               # list of posts. look for <div>s with posts.
+               elsif ($mode eq 'posts') {
+                       # post found
+                       if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
+                               %post = ();
+                               
+                               $post{'id'} = $1;
+                               $post{'threadid'} = $threadid;
+                               $post{'groupid'} = $$settings{'id'};
+                               $post{'timenumber'} = $timenumber;
+                               
+                               $mode = 'post';
+                               $level=0;
+                               $attnumber=0;
+                               
+                               # if page type is 'post' the URL is not enough to see which group the
+                               # posts belongs to. has to be found on the page BEFORE entering first
+                               # post <div>!
+                               if($pagetype eq 'post') {
+                                       if(!$groupid) {
+                                               print "Can't determine if post belongs to group $$settings{'id'}.\n";
+                                               $mode='';
+                                               last;
+                                       }
+                                       elsif($post{'id'} eq $postid) {
+                                               # In this page type it's important to check if this post is a reply
+                                               # to a thread or to a post.
+                                               $firstpost=1;
                                        }
                                        else {
-                                               print "Post does not belong to group $$settings{'id'}.\n";
-                                               $mode = '';
-                                               last;
+                                               $firstpost=0;
+                                               $post{'postid'} = $postid;
                                        }
                                }
+                               print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n";
                        }
-                       
-                       elsif ($mode eq 'post') {
-                               if ($tag {'<'} eq 'h3') {
-                                       $mode = 'post-author';
+                       # Link can be used to determine which group the post belongs to.
+                       elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) {
+                               if ($1 eq $$settings{'id'}) {
+                                       $groupid = 1;
                                }
-                               elsif ($tag{'<'} eq 'abbr') {
-                                       $mode = 'post-time';
+                               else {
+                                       print "Post does not belong to group $$settings{'id'}.\n";
+                                       $mode = '';
+                                       last;
                                }
-                               if ($tag{'<'} eq 'div') {
-                                       if(($tag{'class'} eq '')and(!defined($post{'content'}))) {
-                                               $mode = 'post-content';
-                                               $level2=0;
-                                               $ignoretext=0;
-                                               $hidename=0;
-                                               $link=0;
-                                       }
-                                       else {
-                                               ++$level;
-                                       }
+                       }
+               }
+               
+               # one post
+               elsif ($mode eq 'post') {
+                       # thread author is in first <h3>
+                       if ($tag {'<'} eq 'h3') {
+                               $mode = 'post-author';
+                       }
+                       elsif ($tag{'<'} eq 'abbr') {
+                               $mode = 'post-time';
+                       }
+                       if ($tag{'<'} eq 'div') {
+                               # Post content is (always?) in the first <div> without a class name after author
+                               if(($tag{'class'} eq '')and(!defined($post{'content'}))) {
+                                       $mode = 'post-content';
+                                       $level2=0;
+                                       $ignoretext=0;
+                                       $hidename=0;
+                                       $link=0;
+                               }
+                               else {
+                                       ++$level;
+                               }
+                       }
+                       elsif ($tag{'<'} eq '/div') {
+                               if ($level) {
+                                       --$level;
                                }
-                               elsif ($tag{'<'} eq '/div') {
-                                       if ($level) {
-                                               --$level;
+                               else {
+                                       # end of post <div>. Now prepare to save the file.
+                                       # path depends if it's a post or a postreply.
+                                       $mode = 'posts';
+                                       
+                                       my $postfile;
+                                       my $postpath = ARCH_PATH.$$settings{'id'}.'/';
+                                       unless (-d $postpath) {
+                                               unless (mkdir $postpath) {
+                                                       print "Can't mkdir $postpath.\n";
+                                               }
+                                       }
+                                       if(($pagemode eq 'post')and !$firstpost){
+                                               $postpath.='postreply/';
                                        }
                                        else {
-                                               $mode = 'posts';
-                                               
-                                               my $postfile;
-                                               my $postpath = ARCH_PATH.$$settings{'id'}.'/';
-                                               unless (-d $postpath) {
-                                                       unless (mkdir $postpath) {
-                                                               print "Can't mkdir $postpath.\n";
-                                                       }
-                                               }
-                                               if(($pagemode eq 'post')and !$firstpost){
-                                                       $postpath.='postreply/';
+                                               $postpath.='post/';
+                                       }
+                                       unless (-d $postpath) {
+                                               unless (mkdir $postpath) {
+                                                       print "Can't mkdir $postpath.\n";
                                                }
-                                               else {
-                                                       $postpath.='post/';
+                                       }
+                                       $postpath.=$post{'threadid'}.'/';
+                                       unless (-d $postpath) {
+                                               unless (mkdir $postpath) {
+                                                       print "Can't mkdir $postpath.\n";
                                                }
+                                       }
+                                       if(($pagemode eq 'post')and !$firstpost){
+                                               $postpath.=$post{'postid'}.'/';
                                                unless (-d $postpath) {
                                                        unless (mkdir $postpath) {
                                                                print "Can't mkdir $postpath.\n";
                                                        }
                                                }
-                                               $postpath.=$post{'threadid'}.'/';
-                                               unless (-d $postpath) {
-                                                       unless (mkdir $postpath) {
-                                                               print "Can't mkdir $postpath.\n";
+                                       }
+                                       
+                                       $postpath.=$post{'id'};
+                                       
+                                       if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
+                                               if (flock ($postfile, 2)) {
+                                                       %post2 = readdatafile($postfile);
+                                                       
+                                                       # Don't overwrite newer information with older.
+                                                       if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) {
+                                                               print ("Newer version already saved.\n\n");
                                                        }
-                                               }
-                                               if(($pagemode eq 'post')and !$firstpost){
-                                                       $postpath.=$post{'postid'}.'/';
-                                                       unless (-d $postpath) {
-                                                               unless (mkdir $postpath) {
-                                                                       print "Can't mkdir $postpath.\n";
+                                                       else {
+                                                               # delete previous information about attachments - the numbers
+                                                               # could have changed.
+                                                               foreach my $ind (keys %post2) {
+                                                                       if($ind =~ /^img(key)?-[0-9]+$/) {
+                                                                               delete $post2{$ind};
+                                                                       }
                                                                }
-                                                       }
-                                               }
-                                               
-                                               $postpath.=$post{'id'};
-                                               
-                                               if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
-                                                       if (flock ($postfile, 2)) {
-                                                               %post2 = readdatafile($postfile);
-                                                               
-                                                               if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) {
-                                                                       print ("Newer version already saved.\n\n");
+                                                               # overwrite previous information with new one
+                                                               foreach my $ind (keys %post) {
+                                                                       $post2{$ind}=$post{$ind};
                                                                }
-                                                               else {
+                                                               if ($post2{'key'} eq '') {
+                                                                       $post2{'key'} = key(KEY_BITS);
+                                                               }
+                                                               # write data to file
+                                                               if (seek($postfile, 0, 0)) {
+                                                                       writedatafile($postfile,%post2);
+                                                                       truncate ($postfile , tell($postfile));
+                                                       
                                                                        foreach my $ind (keys %post2) {
-                                                                               if($ind =~ /^img(key)?-[0-9]+$/) {
-                                                                                       delete $post2{$ind};
-                                                                               }
-                                                                       }
-                                                                       foreach my $ind (keys %post) {
-                                                                               $post2{$ind}=$post{$ind};
-                                                                       }
-                                                                       if ($post2{'key'} eq '') {
-                                                                               $post2{'key'} = key(KEY_BITS);
-                                                                       }
-                                                                       if (seek($postfile, 0, 0)) {
-                                                                               writedatafile($postfile,%post2);
-                                                                               truncate ($postfile , tell($postfile));
-                                                               
-                                                                               foreach my $ind (keys %post2) {
-                                                                                       print "$ind: $post2{$ind}\n";
-                                                                               }
-                                                                               print "saved.\n\n";
-                                                                       }
-                                                                       else {
-                                                                               print "Can't seek $postfile.\n\n";
+                                                                               print "$ind: $post2{$ind}\n";
                                                                        }
+                                                                       print "saved.\n\n";
+                                                               }
+                                                               else {
+                                                                       print "Can't seek $postfile.\n\n";
                                                                }
                                                        }
-                                                       else {
-                                                               print "Can't lock $postfile.\n\n";
-                                                       }
-                                                       close ($postfile);
                                                }
-                                               else
-                                               {
-                                                       print "Can't open $postpath.\n\n";
+                                               else {
+                                                       print "Can't lock $postfile.\n\n";
                                                }
+                                               close ($postfile);
                                        }
-                               }
-                               
-                               elsif ($tag{'<'} eq 'a') {
-                                       if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
-                                               ++$attnumber;
-                                               $post{'img-'.$attnumber}='a_'.$2;
-                                               $mode = 'post-img';
-                                       }
-                                       elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) {
-                                               $mode = 'post-replies';
+                                       else
+                                       {
+                                               print "Can't open $postpath.\n\n";
                                        }
                                }
                        }
                        
-                       elsif ($mode eq 'post-img') {
-                               if ($tag{'<'} eq 'img') {
-                                       my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'});
-                                       if ($imgkey ne '') {
-                                               $post{'imgkey-'.$attnumber}=$imgkey;
-                                       }
-                                       else {
-                                               delete $post{'img-'.$attnumber};
-                                               --$attnumber;
-                                       }
+                       elsif ($tag{'<'} eq 'a') {
+                               # there is an image attached
+                               if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+                                       ++$attnumber;
+                                       $post{'img-'.$attnumber}='a_'.$2;
+                                       $mode = 'post-img';
                                }
-                               elsif ($tag{'<'} eq '/a') {
-                                       $mode = 'post';
+                               # the number of replies may be found in one of these links.
+                               elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) {
+                                       $mode = 'post-replies';
                                }
                        }
-                       
-                       
-                       elsif ($mode eq 'post-author') {
-                               if ($tag{'<'} eq 'a') {
-                                       if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
-                                               my $author = $1;
-                                               if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
-                                                       $author = urldecode($2);
-                                               }
-                                               if ($post{'author'} eq '') {
-                                                       $post{'author'} = $author;
-                                                       $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
-                                               }
-                                       }
+               }
+               
+               # an attached image
+               elsif ($mode eq 'post-img') {
+                       if ($tag{'<'} eq 'img') {
+                               my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'});
+                               if ($imgkey ne '') {
+                                       $post{'imgkey-'.$attnumber}=$imgkey;
                                }
-                               elsif ($tag{'<'} eq '/h3') {
-                                       $mode='post';
+                               else {
+                                       delete $post{'img-'.$attnumber};
+                                       --$attnumber;
                                }
                        }
-                       
-                       elsif ($mode eq 'post-content') {
-                               if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
-                                       ++$level2;
-                                       # $post{'content'}.='<div>';
-                                       $ignoretext=1;
-                               }
-                               elsif ($tag{'<'} eq '/div') {
-                                       if($level2){
-                                               --$level2;
-                                               # $post{'content'}.='</div>';
-                                               unless($level2){
-                                                       $ignoretext=0;
-                                               }
+                       elsif ($tag{'<'} eq '/a') {
+                               $mode = 'post';
+                       }
+               }
+               
+               elsif ($mode eq 'post-author') {
+                       # name can be found in hyperlinks
+                       if ($tag{'<'} eq 'a') {
+                               # there are two types of facebook user IDs
+                               if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+                                       my $author = $1;
+                                       if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                               $author = urldecode($2);
                                        }
-                                       else {
-                                               $mode = 'post';
+                                       if ($post{'author'} eq '') {
+                                               $post{'author'} = $author;
+                                               $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
                                        }
                                }
-                               elsif ($tag{'<'} eq 'br') {
-                                       $post{'content'}.='<br>';
-                               }
-                               elsif ($tag{'<'} eq 'p') {
-                                       $post{'content'}.='<p>';
+                       }
+                       elsif ($tag{'<'} eq '/h3') {
+                               $mode='post';
+                       }
+               }
+               
+               # the post content
+               elsif ($mode eq 'post-content') {
+                       # There should not be any sub<div>s. Ignore everything inside.
+                       if ($tag{'<'} eq 'div') {
+                               ++$level2;
+                               $ignoretext=1;
+                       }
+                       elsif ($tag{'<'} eq '/div') {
+                               if($level2){
+                                       --$level2;
+                                       unless($level2){
+                                               $ignoretext=0;
+                                       }
                                }
-                               elsif ($tag{'<'} eq '/p') {
-                                       $post{'content'}.='</p>';
+                               else {
+                                       $mode = 'post';
                                }
-                               elsif (!$ignoretext) {
-                                       if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
-                                               my $imgurl = urldecode($2);
-                                               my $imgid = 'r_';
-                                               $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
-                                               
-                                               while(length($imgurl)>240) {
-                                                       $imgid.=substr($imgurl,0,120).'-/';
-                                                       $imgurl=substr($imgurl,120);
-                                               }
-                                               $imgid.=$imgurl;
+                       }
+                       elsif ($tag{'<'} eq 'br') {
+                               $post{'content'}.='<br>';
+                       }
+                       elsif ($tag{'<'} eq 'p') {
+                               $post{'content'}.='<p>';
+                       }
+                       elsif ($tag{'<'} eq '/p') {
+                               $post{'content'}.='</p>';
+                       }
+                       elsif (!$ignoretext) {
+                               if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
+                                       my $imgurl = urldecode($2);
+                                       my $imgid = 'r_';
+                                       $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
                                        
-                                               my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
-                                               if ($imgkey ne '') {
-                                                       $post{'content'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
-                                               }
-                                               else {
-                                                       $post{'content'}.='<img src="" alt="'.$tag{'alt'}.'">';
-                                               }
-                                               
-                                       }
-                                       elsif ($tag{'<'} eq 'a') {
-                                               if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
-                                                       $post{'content'}.='<a href="'.urldecode($3).'">';
-                                                       $link=1;
-                                               }
-                                               elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
-                                                       my $person = $1;
-                                                       if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
-                                                               $person = urldecode($2);
-                                                       }
-                                                       $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
-                                                       $link=1;
-                                                       $hidename=1;
-                                               }
+                                       while(length($imgurl)>240) {
+                                               $imgid.=substr($imgurl,0,120).'-/';
+                                               $imgurl=substr($imgurl,120);
                                        }
-                                       elsif ($tag{'<'} eq '/a') {
-                                               if($link) {
-                                                       $post{'content'}.='</a>';
-                                                       $link=0;
-                                                       $hidename=0;
-                                               }
+                                       $imgid.=$imgurl;
+                               
+                                       my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
+                                       if ($imgkey ne '') {
+                                               $post{'content'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
                                        }
                                        else {
-                                       # $post{'content'}.='<!'.$tag{'<'}.'!>';
+                                               $post{'content'}.='<img src="" alt="'.$tag{'alt'}.'">';
                                        }
                                }
-                               
-                       }
-                       
-                       elsif ($mode eq 'post-time') {
-                               if ($tag{'<'} eq '/abbr') {
-                                       $mode = 'post';
-                               }
-                       }
-                       
-                       elsif ($mode eq 'post-replies') {
-                               if ($tag{'<'} eq '/a') {
-                                       $mode = 'post';
+                               elsif ($tag{'<'} eq 'a') {
+                                       # a link to an external page
+                                       if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+                                               $post{'content'}.='<a href="'.urldecode($3).'">';
+                                               $link=1;
+                                       }
+                                       # a link to a user
+                                       elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
+                                               my $person = $1;
+                                               if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+                                                       $person = urldecode($2);
+                                               }
+                                               $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+                                               $link=1;
+                                               $hidename=1;
+                                       }
                                }
-                       }
-                       
-                       
-                       if ($tag{"\\"} ne '') {
-                               $closetag = 1;
-                               next;
-                       }
-                       
-                       local $/ = '<';
-                       unless (defined ($text = <$contentfile>)) {
-                               close($contentfile);
-                               return;
-                       }
-                       local $/ = "\n";
-                       $text =~ s/<$//;
-                       
-                       # # DEBUG
-                       # if ($pagetype eq 'thread') {
-                               # print ">>$mode: $text\n";
-                       # }
-                       
-                       if($mode eq 'thread-content') {
-                               unless ($ignoretext or $hidename){
-                                       $thread{'postcontent'}.=$text;
+                               elsif ($tag{'<'} eq '/a') {
+                                       if($link) {
+                                               $post{'content'}.='</a>';
+                                               $link=0;
+                                               $hidename=0;
+                                       }
                                }
-                       }
-                       elsif ($mode eq 'thread-time') {
-                               $thread{'timetext'}.=$text;
-                       }
-                       elsif ($mode eq 'thread-attachment-link-title') {
-                               $thread{'linktitle-'.$attnumber}.=$text;
-                       }
-                       elsif ($mode eq 'thread-attachment-link') {
-                               $thread{'linktext-'.$attnumber}.=$text;
-                       }
-                       elsif ($mode eq 'thread-replies') {
-                               if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) {
-                                       $thread{'replies'} = $1;
+                               else {
+                               # $post{'content'}.='<!'.$tag{'<'}.'!>';
                                }
                        }
                        
-                       if($mode eq 'post-content') {
-                               unless ($ignoretext or $hidename){
-                                       $post{'content'}.=$text;
-                               }
-                       }
-                       elsif ($mode eq 'post-time') {
-                               $post{'timetext'}.=$text;
+               }
+               
+               elsif ($mode eq 'post-time') {
+                       if ($tag{'<'} eq '/abbr') {
+                               $mode = 'post';
                        }
-                       elsif ($mode eq 'post-replies') {
-                               if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
-                                       $post{'replies'} = $1;
-                               }
+               }
+               
+               elsif ($mode eq 'post-replies') {
+                       if ($tag{'<'} eq '/a') {
+                               $mode = 'post';
                        }
-                       
                }
                
+               # dealing with the tag is finished.
                
+               # if the tag ends with "/>" chande it into an "</" tag ang go through it
+               # again from the start of the loop
+               if ($tag{"\\"} ne '') {
+                       $closetag = 1;
+                       next;
+               }
                
-               close ($contentfile);
+               # now get the text between tags
+               
+               local $/ = '<';
+               unless (defined ($text = <$contentfile>)) {
+                       close($contentfile);
+                       return;
+               }
+               local $/ = "\n";
+               $text =~ s/<$//;
+               
+               # # DEBUG
+               # if ($pagetype eq 'thread') {
+                       # print ">>$mode: $text\n";
+               # }
+               
+               # depending on state add text to relevant fields.
+               
+               if($mode eq 'thread-content') {
+                       unless ($ignoretext or $hidename){
+                               $thread{'postcontent'}.=$text;
+                       }
+               }
+               # the format facebook uses for showing time is not always helpful (for
+               # example: "2 mins ago") There is no setting to change it in facebook.
+               # at least in the m.facebook.com. The bot corrently DOES NOT interpret the
+               # text.
+               elsif ($mode eq 'thread-time') {
+                       $thread{'timetext'}.=$text;
+               }
+               elsif ($mode eq 'thread-attachment-link-title') {
+                       $thread{'linktitle-'.$attnumber}.=$text;
+               }
+               elsif ($mode eq 'thread-attachment-link') {
+                       $thread{'linktext-'.$attnumber}.=$text;
+               }
+               elsif ($mode eq 'thread-replies') {
+                       if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) {
+                               $thread{'replies'} = $1;
+                       }
+               }
+               
+               if($mode eq 'post-content') {
+                       unless ($ignoretext or $hidename){
+                               $post{'content'}.=$text;
+                       }
+               }
+               elsif ($mode eq 'post-time') {
+                       $post{'timetext'}.=$text;
+               }
+               elsif ($mode eq 'post-replies') {
+                       if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
+                               $post{'replies'} = $1;
+                       }
+               }
                
        }
+       close ($contentfile);
 }
 
 # Function to read data from datafiles.
@@ -1031,6 +1168,15 @@ sub readdatafile {
        return %data;
 }
 
+# the function to write data to datafiles (see readdatafile() description)
+#
+# First argument can be a path or a file handle. In case of a file handle it
+# will just read the file. In case of path it opens the file before writing and
+# closes after.
+#
+# On failure (file not open) returns 0.
+# On success returns 1.
+#
 sub writedatafile {
        (my $headerpath, my %header) = @_;
        my $headerfile;
@@ -1060,6 +1206,9 @@ sub writedatafile {
        return 1;
 }
 
+# Function to get a timenumber from a "date" http header field value.
+# It's a 14 digit number: 4 - year, 2 - month, 2 - day, 2 - hour, 2 - minute,
+# 2 - second.
 sub gettimenumber {
        (my $date) = @_;
        my $year;
@@ -1069,7 +1218,8 @@ sub gettimenumber {
        my $minute;
        my $second;
        
-       # see https://tools.ietf.org/html/rfc2616#section-3.3.1
+       # There are 3 possible formats.
+       # See https://tools.ietf.org/html/rfc2616#section-3.3.1
        if ($date =~ /^[A-Za-z]{3}, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/){
                $day=$1;
                $month=lc($2);
@@ -1142,6 +1292,13 @@ sub gettimenumber {
        return $year.$month.$day.$hour.$minute.$second;
 }
 
+# Function to get information about a html tag.
+# The argument is the tag text without the <>!
+# It returns a hash with the attributes' values.
+# special values:
+# '<' - the tag name (may start with '/'),
+# '/' - when the tag starts with '/',
+# '\' - when the tag ends with '/',
 sub taginfo {
        (my $tagtext) = @_;
        my %tag;
@@ -1169,15 +1326,11 @@ sub taginfo {
        if ($tagtext =~ /\/[ \t\r\n]*$/) {
                $tag{"\\"}="\\";
        }
-       # if ($tag{'<'} eq 'div') { 
-       # foreach my $ind (keys %tag) {
-                                               # print "$ind: $tag{$ind}\n";
-                                       # }
-                                       # print "\n";
-       # }
        return %tag; 
 }
 
+# Function to generate a random hexadecimal number (key) with a defined number
+# of bits.
 sub key {
        (my $bits) = @_;
        my $p = int($bits / 16);
@@ -1194,6 +1347,13 @@ sub key {
        return $keytext;
 }
 
+# The function to save an image from proxy archive
+# Arguments:
+# 1 - url of image
+# 2 - image id
+# 3 - group id
+# Returns 1 on success and 0 on failure.
+#
 sub saveimg {
        (my $url, my $id, my $groupid) = @_;
        
@@ -1214,6 +1374,7 @@ sub saveimg {
        
        print "Image $id\n";
        
+       # prepare path
        $basepath=ARCH_PATH.$groupid.'/';
        unless (-d $basepath){
                unless (mkdir $basepath) {
@@ -1229,7 +1390,6 @@ sub saveimg {
                }
        }
        
-       
        my $idtemp = $id;
        while((my $ind = index($idtemp,'/'))>=0) {
                $basepath.=substr($idtemp, 0, $ind+1);
@@ -1246,6 +1406,7 @@ sub saveimg {
        $headpath=$basepath.'@h';
        $imgpath=$basepath.'@v';
        
+       # check if image already saved
        if ( -f $imgpath) {
                if (open($headfile,"+<",$headpath)) {
                        if (flock ($headfile, 2)) {
@@ -1272,6 +1433,9 @@ sub saveimg {
        $archheadpath = $archbasepath.'@h';
        $archimgpath = $archbasepath.'@v';
        print "url: $url\n";
+       # Read the http header. Only interested in the status. If a redirection then
+       # follow it and read again. If 200 it's ok to continue processing. Otherwise
+       # return.
        for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
                %archheader = readheaderfile($archheadpath); 
                
@@ -1301,12 +1465,15 @@ sub saveimg {
                return '';
        }
        
+       # save content-type and content-disposition information.
        foreach my $ind (keys %archheader) {
                if ($ind =~ /^(content-type|content-disposition)$/) {
                        $header{$ind}=$archheader{$ind};
                }
        }
+       # generate key
        $header{'key'}=key(KEY_BITS);
+       # save image size
        if (@stat = stat($archimgpath)){
                $header{'content-length'}=$stat[7];
        }
@@ -1336,13 +1503,12 @@ sub saveimg {
                }
        }
        
-       foreach my $ind (keys %header) {
-               print $headfile "$ind: $header{$ind}\n";
-               print "$ind: $header{$ind}\n";
-       }
+       # save header
+       writedatafile($headfile,%header);
        print $headfile "\n";
        close ($headfile);
        
+       # Finally, copy the image.
        unless (copy($archimgpath,$imgpath)) {
                print "Can't copy $archimgpath.\n\n";
                return '';
@@ -1350,4 +1516,4 @@ sub saveimg {
        print "saved.\n\n";
        return $header{'key'};
        
-}
\ No newline at end of file
+}
diff --git a/config.1.txt b/config.1.txt
new file mode 100644 (file)
index 0000000..fadb842
--- /dev/null
@@ -0,0 +1,9 @@
+# config.txt is generated from config.1.txt
+# 8<----------------------------------------------------------------------------
+# Copy this to your Apache2 configuration:
+
+# 8<----------------------------------------------------------------------------
+# Copy this to your crontab:
+###BOT_CRONTAB;
+
+# 8<----------------------------------------------------------------------------
index acf692e462727f26ce4d0e123d94fb3056f7cb49..423c95d783a26e055a6e34a09b5fc76533092754 100644 (file)
@@ -1,8 +1,8 @@
 #!/usr/bin/perl
 
-# The proxy software, when run on a server, will use different directories,
-# host names, tcp ports, etc. than the server on which this software was
-# originally written.
+# The facebook interface software, when run on a server, will use different
+# directories, host names, tcp ports, etc. than the server on which this
+# software was originally written.
 # These things are defined in the file 'settings'.
 # This script is called from the makefile. It reads the settings file and
 # inserts the information in the source files.
@@ -19,13 +19,12 @@ unless (open $configfile, "<", $ARGV[0]) {
 
 # Read the config file, line format:
 # some_name = some value # some comment
-while ($line = <$configfile>) {
+while (defined(my $line = <$configfile>)) {
        $line =~ s/[\r\n]//g;
-       $line =~ s/#.*$//;
-       if ($line =~ /^ *([a-zA-Z0-9_]+) *= *(.*)$/){
-               $name=$1;
-               $value=$2;
-               $value =~ s/ *$//;
+       $line =~ s/#.*$//; #comment
+       if ($line =~ /^[ \t]*([a-zA-Z0-9_\-\.]+)[ \t]*=[ \t]*([^ \t](.*[^ \t])?)[ \t]*$/){
+               my $name=$1;
+               my $value=$2;
                $set{$name}=$value;
        }
 }
@@ -47,6 +46,8 @@ $def{'PATH'} = "\$ENV{'PATH'} = '".$set{'path'}."';";
 
 $def{'PERL'} = "#!".$set{'perl'};
 
+$def{'BOT_CRONTAB'}   = $set{'bot_crontab'}.' '.$set{'bin_path'}.'bot'.(($set{'bot_args'} ne '')?(' '.$set{'bot_args'}):'');
+
 $def{'CC'} = 'CC='.$set{'gcc'};
 $def{'CF'} = 'CF='.$set{'c_flags'};
 $def{'PL'} = 'PL='.$set{'perl'};
@@ -61,7 +62,7 @@ $def{'CM'} = 'CM='.$set{'chmod'};
 # ###SOME_NAME;
 # If found - replace.
 
-while ($line = <STDIN>) {
+while (defined($line = <STDIN>)) {
        $line =~ s/[\r\n]//g;
        if ($line =~ /###([a-zA-Z0-9_]+);/) {
                print "$def{$1}\n";
index f905b3637a51df5cfe5ca16d6bdf56b526e3c35f..f502c247a040aead3afaba220de4f8b451ea43c3 100644 (file)
@@ -9,7 +9,7 @@
 ###CM;
 ###OD;
 
-all: moveout copyout remove #config.txt
+all: moveout copyout remove config.txt
 
 
 moveout: bot   setuid exec
@@ -27,5 +27,9 @@ exec: bot
 remove:   copyout moveout setuid exec 
        # $(RM) proxy.c access.c
 
+
 bot: bot.1.pl   configure.pl settings 
        $(PL) configure.pl settings <bot.1.pl >bot
+
+config.txt: config.1.txt   configure.pl settings
+       $(PL) configure.pl settings <config.1.txt >config.txt
diff --git a/readthis.txt b/readthis.txt
new file mode 100644 (file)
index 0000000..d58a85d
--- /dev/null
@@ -0,0 +1,51 @@
+This is the facebook bot and the interface
+It depends on the proxy and some other software:
+-Apache2 (2.2) (only the interface - not written yet)
+-Perl
+-curl
+-gzip (only for compressing old log files)
+and for compilation:
+-cp
+-move
+-rm
+-chmod
+
+It might work with other versions of Apache2  It may work with other
+www server software.
+
+Recommended situation is when the software and data directories belong to a
+dedicated user account. The same user who also has the proxy.
+
+data_path, tmp_path, log_path should only be accessible by this user.
+bin_path should be publicly accessible - the programs will be called from the
+server. Some will have the SETUID bit set.
+
+To compile/install:
+
+Log in to the user account that will own the software.
+(If not, you will have to change file ownerships later.)
+Edit the file 'settings' to have values relevant to your server/computer.
+Create the directories defined there and set correct permissions and ownership.
+Run 'make.sh'. It will generate the programs and copy them to the correct
+location.
+It will also generate config.txt.
+Open this file and copy its fragments to your Apache2 config and crontab.
+#Restart Apache2. (no need for this yet)
+
+To add a facebook group:
+
+Create a file in /data_path/groupsettings. The file name is a number - the
+group id. In this file define the following settings:
+id - the group id
+hidenames - defines if the real names of people will be hidden or converted
+            "0" or "no" means no, anything else means yes. default value if
+            undefined is yes. Not implemented yet (maybe never will). Now it
+            will always hide names.
+create another file, add "-names" to the group id for the file name.
+In this file should be name mappings:
+facebook_id = filtered_name
+There MUST be a default name
+default = filtered_name.
+
+The bot accepts group IDs as commandline arguments. If there are none the bot
+will process all groups that have a config file. 
\ No newline at end of file
index 0b058acda9059ce06ab38c84be0a2e10d3ddd0aa..2cff094f5c87c5bad185989514314f552e8964c3 100644 (file)
--- a/settings
+++ b/settings
@@ -1,18 +1,23 @@
 #all directory paths must end with '/' and must already exist.
 
-bin_path  = /yplom/bin/facebug1/  
-data_path = /yplom/data/facebug1/ 
-log_path  = /yplom/log/facebug1/  
-tmp_path  = /yplom/tmp/facebug1/  
-#www_path  = /yplom/www/facebug1/  
+bin_path  = /yplom/bin/facebug1/  #Where the software will be located
+data_path = /yplom/data/facebug1/ #where the software  will remember data; subdir:
+                                  #group, groupsettings
+log_path  = /yplom/log/facebug1/  #where the software will remember data
+tmp_path  = /yplom/tmp/facebug1/  #for temporary fies
+www_path  = /yplom/www/facebug1/  #for the www server unused
 
-proxy_bin_path  = /yplom/bin/proxy/ 
-proxy_data_path = /yplom/data/proxy/
-proxy_lib_path  = /yplom/lib/proxy/
+proxy_bin_path  = /yplom/bin/proxy/  #where the proxy software is located
+proxy_data_path = /yplom/data/proxy/ #where the proxy remembers data
+proxy_lib_path  = /yplom/lib/proxy/  #where the proxy library is located 
 
-path    = /usr/local/bin:/usr/bin:/bin
+path    = /usr/local/bin:/usr/bin:/bin #The path environment variable. Must be
+                                       #overwritten if SETUID. Otherwise
+                                       #launching programs may fail. (Perl
+                                       #security...)
+
+#paths to software
 perl    = /usr/bin/perl
-curl    = /usr/bin/curl
 chmod   = /bin/chmod
 cp      = /bin/cp
 mv      = /bin/mv
@@ -31,4 +36,9 @@ logs_total        =    10 # How many old logs to keep
 rm_access_crontab =  0 1 * * * # How often to remove leftover unlock info.
 cleararch_crontab = 30 0 * * * # How often to clear the archive from old files.
 oldlogs_crontab   =  0 0 * * * # How often to deal with old logs
+bot_crontab       = 20 2 * * * # How often to run the bot
 # Not a good idea to launch oldlogs just after cleararch, I think.
+
+bot_args = 207426296087284 # facebook groups which will be processed by the bot
+                           # numbers separated by spaces. Can be left empty.
+                           # The bot will process all groups then.