###PERL;
+# bot is generated from bot.1.pl
+#
+# This is the facebook bot. It depends on the proxy.
+# It reads pages from m.facebook.com archived on the proxy, extracts threads,
+# posts, images, etc. from a facebook group and saves them.
+
use strict;
use Fcntl;
use File::Copy;
# Seed the random number generator from $time and the process ID ($$).
# NOTE(review): $time is not assigned in the visible part of the file -
# presumably it is set elsewhere; confirm.
srand ($time-$$);
# Log the start of the run as "dd.mm.YYYY HH:MM:SS", using gmtime (GMT).
# NOTE(review): strftime normally comes from POSIX, which is not among the
# visible "use" lines - confirm it is imported elsewhere.
print strftime("%d.%m.%Y %H:%M:%S", gmtime($time))."\n";
+# If there are commandline arguments the bot will only process the facebook
+# group(s) defined by them. The arguments are numbers which are the group's ID
+# and also the name of the config file.
if (scalar @ARGV) {
# Each argument is handed to processgroup() in command-line order.
for (my $ind=0; $ind < scalar @ARGV; ++$ind) {
processgroup($ARGV[$ind]);
}
}
+# If there were no commandline arguments the bot will process all groups which
+# have a config file defined.
else {
# NOTE(review): only the declarations of this branch are visible here; the
# directory scan that presumably uses $dir and $subpath is not shown.
my $dir;
my $subpath;
}
}
+# The function to process one facebook group. The argument is a number which is
+# the group's ID and also the config file name.
sub processgroup {
# $filenumber - the group's ID as given by the caller; it is also used as the
# name of the group's config file under GROUPSETTINGS_PATH.
(my $filenumber) = @_;
my $settingspath;
my %names;
my $archpath;
+ # ID must be a number
# The capture ($1) rather than the raw argument is used to build the path.
if ($filenumber =~ /^([0-9]+)$/) {
$settingspath = GROUPSETTINGS_PATH.$1;
}
# NOTE(review): the start of the branch that rejects a non-numeric ID is not
# visible here; the return and closing brace below presumably belong to it.
return;
}
+ # read the configuration file
%settings = readconfigfile($settingspath);
+ # The settings defined so far:
+ # id - the group's ID. SHOULD be the same as the file name.
+ # hidenames - defines if the real names of people will be hidden or converted
+ # "0" or "no" means no, anything else means yes. default value if
+ # undefined is yes. Not implemented yet (maybe never will). Now it
+ # will always hide names.
- #The group id SHOULD be the filename. But what if it isn't?
# From here on the ID taken from the config file ('id' setting) is used, not
# the file name.
if ($settings{'id'} =~ /^([0-9]+)$/) {
$groupid=$1;
}
# NOTE(review): as above, the start of the failure branch (non-numeric 'id')
# is not visible here.
return;
}
+ #read the list of names. A default name MUST be defined.
# The names file lives next to the config file: "<groupid>-names".
$namespath = GROUPSETTINGS_PATH.$groupid.'-names';
%names = readconfigfile($namespath);
# NOTE(review): the body of this check (missing default name) is not visible
# here.
if ($names{'default'} eq '') {
}
print "Group $groupid\n";
-
+ # process the pages with threads:
# processdir() is called for both the '@q/' and the '/@q/' variant of the
# path, with page mode 0.
$archpath = url2path('https://m.facebook.com/groups/'.$groupid);
processdir($archpath.'@q/',\%settings,\%names,0);
processdir($archpath.'/@q/',\%settings,\%names,0);
-
+ # process the pages with posts:
# Same two path variants, this time with page mode 1.
$archpath = url2path('https://m.facebook.com/comment/replies');
processdir($archpath.'@q/',\%settings,\%names,1);
processdir($archpath.'/@q/',\%settings,\%names,1);
}
+# The function to process all files in a directory and (recursively) all
+# subdirectories.
+# The first argument is the directory path (ending with "/").
+# The three other arguments are passed to processfile().
sub processdir {
# $dirpath - directory to scan; callers pass it with a trailing "/".
# $settings, $names, $pagemode - handed on unchanged to the recursive call.
(my $dirpath, my $settings, my $names, my $pagemode) = @_;
my $dir;
# NOTE(review): the code that opens $dir is not visible here; the closing
# brace below presumably ends its failure branch.
}
while (defined($subpath = readdir $dir)) {
# $dirpath already ends with "/", so plain concatenation gives the full path.
$subpathfull=$dirpath.$subpath;
+ # "." or ".." should be ignored
if ($subpath =~ /^\.\.?$/) {
next;
}
# Recurse into the entry, adding the trailing "/" this function expects.
# NOTE(review): the test that separates directories from files (and the
# handling of plain files) is not visible here.
processdir($subpathfull.'/', $settings, $names, $pagemode);
}
}
- closedir ($dir);
+ closedir ($dir);
}
+# The function to process one archived page.
+# Arguments:
+# 1. The path to the archived header. Ends with "@h". Otherwise function just
+# returns without doing anything.
+# 2. reference to the hash with group settings
+# 3. reference to the hash with names
+# 4. mode, determines how pages are interpreted. If 0 page type can be
+# 'thread' - one thread
+# 'group' - list of threads' firstposts
+# If nonzero page type can be
+# 'post' - one post (with replies)
+#
sub processfile {
(my $headerpath, my $settings, my $names, my $pagemode) = @_;
my $basepath;
my %cgi;
- my $postid;
- my $threadid;
- my $groupid=0; ###!
- my $timenumber;
+ my $postid; #id of post
+ my $threadid; #id of thread
+ my $groupid=0; #determines if group id was found on page. REAL group id is in $$settings{'id'}!
+ my $timenumber; #number, determines when page was saved on the proxy.
- my %thread;
- my %thread2;
+ my %thread; #thread info, created by interpreting the page
+ my %thread2; #thread info, created by reading previously archived file
- my %post;
- my %post2;
+ my %post; #post info, created by interpreting the page
+ my %post2; #post info, created by reading previously archived file
- my $pagetype;
+ my $pagetype; #type of page: 'group', 'thread' or 'post'
+ # Argument must be HEADER path.
if ($headerpath =~ /^((.+)\@h)$/) {
$headerpath = $1;
$basepath = $2;
($prot, $host, $port, $path, $query) = path2urldiv($basepath);
print 'Page '.joinurl($prot, $host, $port, $path, $query)."\n";
- ### REDESIGN THE CONDITIONS!
+
+ #Determine what type of page it is. If none of the three types - return.
if($query ne '') {
%cgi=getcgi($query);
if($pagemode) {
$pagetype = 'group';
}
}
- print " type=$pagetype\n";
+ print "type=$pagetype\n";
+ # Read the http header. Only interested in the status. If a redirection then
+ # follow it and read again. If 200 it's ok to continue processing. Otherwise
+ # return.
for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
%header = readheaderfile($headerpath);
- if ($header{'status'} =~ /^200 /) {
- last;
- }
- elsif ($header{'status'} =~ /^30[1237] /) {
+ if ($header{'status'} =~ /^30[1237] /) {
my $location;
unless (defined($location = $header{'location'})) {
return;
$contentpath = $basepath.'@v';
}
else {
- return;
+ last;
}
}
+ if ($header{'status'} !~ /^200 /) {
+ return;
+ }
+ # find out when the page was archived on the proxy.
unless ($timenumber = gettimenumber ($header{'date'})) {
my @stat;
unless (@stat = stat $contentpath) {
}
}
- # This condition is redundant now.
- if ($pagetype) {
+ # Now the bot can start reading the file
+
+ unless (open ($contentfile, "<",$contentpath)) {
+ print "Can't open $contentpath.\n";
+ return;
+ }
+
+ my $text; # piece of text read from the file
+ my %tag; # html tag
+ my $mode; # the state of the main state machine
+ my $level; # keeps track in how many <div> levels the bot went
+ my $level2; # same as above used in different state (previous must be kept)
+ my $closetag=0;# if there is a tag to close
+ my $ignoretext;# if text should be ignored and not added to post/thread content
+ my $link; # if bot is inside a link
+ my $hidename; # if bot is inside a part which contains a name to be hidden
+ my $attnumber; # number of current attachment
+ my $incomplete;# if thread firstpost's content is incomplete
+ my $firstpost; # if the bot is in the firstpost (important if pagetype='post')
+
+ # Set initial values depending on page type.
+ if ($pagetype eq 'thread') {
+ print "Thread $threadid\n";
+ $thread{'id'}=$threadid;
+ $thread{'groupid'}=$$settings{'id'};
+ $thread{'timenumber'}=$timenumber;
+ $mode = 'thread';
+ $level=0;
+ $attnumber=0;
+ $incomplete=0;
+ }
+ elsif ($pagetype eq 'post'){
+ print "Post $postid ($threadid)\n";
- unless (open ($contentfile, "<",$contentpath)) {
- print "Can't open $contentpath.\n";
- return;
+ $mode='posts';
+ $firstpost=1;
+ }
+ else { #group
+ print "Threads\n";
+ $mode = 'threads';
+ }
+
+ my $line;
+
+ # The main loop. It reads the file like this:
+ # <tag>text<tag>text<tag>text...
+ # If there are two tags next to each other <tag1><tag2> there still is text
+ # between them, only with zero length.
+ # If the tag looks like this:
+ # <tag name1="value1" />
+ # the bot will interpret it as
+ # <tag name1="value1"></tag>
+ # without the 0-length text between.
+ #
+ # This is a state machine
+ # In the main loop the tag is read and depending on the tag's content, state
+ # and some variable values some actions can be taken.
+ # Then the text is read and again depending on the text, state and some
+ # variable values some actions can be taken.
+ # This loop continues until the end of file.
+
+ #read and ignore text before the first tag.
+ local $/ = '<';
+ unless (defined ($text = <$contentfile>)) {
+ close($contentfile);
+ return;
+ }
+ # main loop
+ while ($mode ne '') {
+ # if there was a tag ending with "/>" in the previous iteration then in this
+ # one it will be treated as the same tag but with "</".
+ if ($closetag){
+ $tag{'<'} = '/'.$tag{'<'};
+ $tag{'/'}='/';
+ delete $tag{"\\"};
+ $closetag=0;
}
- my $text;
- my %tag;
- my $mode;
- my $level;
- my $level2;
- my $closetag=0;
- my $ignoretext;
- my $link;
- my $hidename;
- my $attnumber;
- my $incomplete;
- my $firstpost;
-
- if ($pagetype eq 'thread') {
- print "Thread $threadid\n";
-
- $thread{'id'}=$threadid;
- $thread{'groupid'}=$$settings{'id'};
- $thread{'timenumber'}=$timenumber;
- $mode = 'thread';
- $level=0;
- $attnumber=0;
- $incomplete=0;
- }
- elsif ($pagetype eq 'post'){
- print "Post $postid ($threadid)\n";
+ # otherwise read next tag
+ else {
+ local $/ = '>';
+ unless (defined ($text = <$contentfile>)) {
+ close($contentfile);
+ return;
+ }
+ $text =~ s/>$//;
+ # # DEBUG:
+ # if($pagetype eq 'thread'){
+ # print ">>$mode: <$text>\n";
+ # }
- $mode='posts';
- $firstpost=1;
- }
- else { #group
- print "Threads\n";
- $mode = 'threads';
+ # get the attributes of the tag
+ # special values: '<' - tag name (with or without '/'), '/' - when it's a
+ # closing tag, '\' when tag ends with "/>".
+ %tag = taginfo($text);
}
+ local $/ = "\n";
- my $line;
-
- local $/ = '<';
- unless (defined ($text = <$contentfile>)) {
- close($contentfile);
- return;
+ # List of threads. Look for <div>s with threads
+ if ($mode eq 'threads'){
+ # Thread found. Id not known yet!
+ if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) {
+ print "Thread [$1]\n";
+ $mode = 'thread';
+ # set initial values
+ %thread = ();
+ $thread{'groupid'}=$$settings{'id'};
+ $thread{'timenumber2'}=$timenumber;
+ $level = 0;
+ $attnumber=0;
+ $incomplete=0;
+ }
}
- while ($mode ne '') {
- if ($closetag){
- $tag{'<'} = '/'.$tag{'<'};
- $tag{'/'}='/';
- delete $tag{"\\"};
- $closetag=0;
+
+ # one thread
+ elsif ($mode eq 'thread'){
+ # thread author is in first <h3>
+ if ($tag{'<'} eq 'h3') {
+ $mode = 'thread-author';
}
- else {
- local $/ = '>';
- unless (defined ($text = <$contentfile>)) {
- close($contentfile);
- return;
+ elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
+ # Post content is (always?) in the first <div> with a 2 letter class name after author
+ if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) {
+ $mode='thread-content';
+ $level2=0;
+ $ignoretext=1; # text in firstposts only inside <p>
+ $hidename=0;
+ $link=0;
+ }
+ else {
+ ++$level;
}
- $text =~ s/>$//;
- # # DEBUG:
- # if($pagetype eq 'thread'){
- # print ">>$mode: <$text>\n";
- # }
- %tag = taginfo($text);
}
- local $/ = "\n";
+ elsif (($tag{'<'} eq '/div') and $level) {
+ --$level;
+ }
- if ($mode eq 'threads'){
- if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) {
- print "Thread [$1]\n";
- $mode = 'thread';
- %thread = ();
- $thread{'groupid'}=$$settings{'id'};
- $thread{'timenumber2'}=$timenumber;
- $level = 0;
- $attnumber=0;
- $incomplete=0;
- }
+ # the time text is in the only <abbr> in thread firstpost
+ elsif ($tag{'<'} eq 'abbr') {
+ $mode = 'thread-time';
}
- elsif ($mode eq 'thread'){
- # print "+++$text+++\n";
- if ($tag{'<'} eq 'h3') {
- $mode = 'thread-author';
+ elsif ($tag{'<'} eq 'a') {
+ # there is an image attached
+ if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+ ++$attnumber;
+ $thread{'img-'.$attnumber}='a_'.$2;
+ $mode = 'thread-attachment-img';
}
- elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
- # These are very 'helpful' class names, facebug, thank you!
- # After recent changes do I still need the unreliable class name?
- # Is post content always in first <div> after author with a 2 letter class name?
- # Let's test:
- if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) {
- # if (($tag{'class'} =~ /^(bj|bk|bm)|(db|da)$/) and (!defined($thread{'postcontent'}))) {
- $mode='thread-content';
- $level2=0;
- $ignoretext=1;
- $hidename=0;
- $link=0;
- }
- # elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)|(dc|db)$/)) { ### NAMES NOT RELIABLE! HAVE TO IMPROVE SERIOUSLY!
- # $mode='thread-attachment';
- # $level2=0;
- # $attnumber=0;
- # }
- else {
- ++$level;
+ # there is a link attached
+ elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+ ++$attnumber;
+ $thread{'link-'.$attnumber}=urldecode($3);
+ $mode = 'thread-attachment-link';
+ }
+ # if thread id is not known it can be determined from this link.
+ # also, the number of replies may be found in one of these links.
+ elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) {
+ if ($thread{'id'} eq '') {
+ $thread{'id'} = $2;
+ print "Thread $thread{'id'}\n";
}
+ $mode = 'thread-replies';
}
- elsif (($tag{'<'} eq '/div') and $level) {
- --$level;
+ }
+
+ # Depending on page mode the thread and firstpost information can end in
+ # two different ways. When page type is 'thread' the end is at the <div>
+ # whose id starts with "ufi_". When page type is 'group' it ends when
+ # leaving the thread-related div
+ #
+ elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) {
+ # depending on page type the rest of the page contains posts or other
+ # threads.
+ if ($pagetype eq 'thread') {
+ $mode='posts';
}
-
- elsif ($tag{'<'} eq 'abbr') {
- $mode = 'thread-time';
+ else {
+ $mode='threads';
}
- elsif ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
- ++$attnumber;
- $thread{'img-'.$attnumber}='a_'.$2;
- $mode = 'thread-attachment-img';
- }
- elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
- ++$attnumber;
- $thread{'link-'.$attnumber}=urldecode($3);
- $mode = 'thread-attachment-link';
+ # Now prepare to save the file
+ my $threadfile;
+ my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
+ unless (-d $threadpath) {
+ unless (mkdir $threadpath) {
+ print "Can't mkdir $threadpath.\n";
}
- elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) {
- if ($thread{'id'} eq '') {
- $thread{'id'} = $2;
- print "Thread $thread{'id'}\n";
- }
- $mode = 'thread-replies';
+ }
+ $threadpath.='thread/';
+ unless (-d $threadpath) {
+ unless (mkdir $threadpath) {
+ print "Can't mkdir $threadpath.\n";
}
}
+ $threadpath.=$thread{'id'};
- elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) {
- if ($pagetype eq 'thread') {
- $mode='posts';
- }
- else {
- $mode='threads';
- }
-
- my $threadfile;
- my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
- unless (-d $threadpath) {
- unless (mkdir $threadpath) {
- print "Can't mkdir $threadpath.\n";
- }
- }
- $threadpath.='thread/';
- unless (-d $threadpath) {
- unless (mkdir $threadpath) {
- print "Can't mkdir $threadpath.\n";
+ if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) {
+ if (flock ($threadfile, 2)) {
+ # read the data already saved in file
+ %thread2 = readdatafile($threadfile);
+
+ # in 'threads' page type the firstpost's content may be incomplete.
+ # in that case it should not be written to file if there is already
+ # one. Even if the file has an older version.
+ # but in this page type there is information about the number of
+ # replies, not found in 'thread' page type. This information should
+ # be written even when the post content shouldn't.
+ #
+ # That's why a thread has the timenumber and timenumber2.
+ # timenumber defines the time of the post content. in pagetype
+ # 'thread' only timenumber is checked and only timenumber is
+ # updated.
+ # timenumber2 defines the time of information only available in the
+ # 'group' page type. In this pagetype the post content is only
+ # updated if complete and timenumber allows it. Other information is
+ # updated if timenumber2 allows it
+
+ # Don't overwrite newer information with older.
+ if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) {
+ print ("Newer version already saved.\n\n");
}
- }
- $threadpath.=$thread{'id'};
-
- # if (open ($threadfile, "+<", $threadpath)) {
- if (sysopen ($threadfile, $threadpath, O_RDWR | O_CREAT)) {
- if (flock ($threadfile, 2)) {
- %thread2 = readdatafile($threadfile);
-
- if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) {
- print ("Newer version already saved.\n\n");
- }
- else {
- if($pagetype ne 'thread'){
- if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) {
- print ("Newer version of post content already saved.\n");
- delete $thread{'postcontent'};
- }
- elsif($incomplete) {
- print ("Post content incomplete.\n");
- if(defined($thread2{'postcontent'})){
- delete $thread{'postcontent'};
- }
- }
- else {
- $thread{'timenumber'}=$thread{'timenumber2'};
- }
+ else {
+ if($pagetype ne 'thread'){
+ # Don't overwrite newer post content with older.
+ if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) {
+ print ("Newer version of post content already saved.\n");
+ delete $thread{'postcontent'};
}
- foreach my $ind (keys %thread2) {
- if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
- delete $thread2{$ind};
+ elsif($incomplete) {
+ # Don't overwrite complete post content with incomplete one.
+ # Write incomplete content if nothing was archived before,
+ # better this than nothing.
+ print ("Post content incomplete.\n");
+ if(defined($thread2{'postcontent'})){
+ delete $thread{'postcontent'};
}
}
- foreach my $ind (keys %thread) {
- $thread2{$ind}=$thread{$ind};
+ else {
+ $thread{'timenumber'}=$thread{'timenumber2'};
}
- if ($thread2{'key'} eq '') {
- $thread2{'key'} = key(KEY_BITS);
+ }
+ # delete previous information about attachments - the numbers
+ # could have changed.
+ foreach my $ind (keys %thread2) {
+ if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
+ delete $thread2{$ind};
}
+ }
+
+ # overwrite previous information with new one
+ foreach my $ind (keys %thread) {
+ $thread2{$ind}=$thread{$ind};
+ }
+ if ($thread2{'key'} eq '') {
+ $thread2{'key'} = key(KEY_BITS);
+ }
+
+ # write data to file
+ if (seek($threadfile, 0, 0)) {
+ writedatafile($threadfile,%thread2);
+ truncate ($threadfile , tell($threadfile));
- if (seek($threadfile, 0, 0)) {
- writedatafile($threadfile,%thread2);
- truncate ($threadfile , tell($threadfile));
-
- foreach my $ind (keys %thread2) {
- print "$ind: $thread2{$ind}\n"; ####
- }
- print "saved.\n\n";
- }
- else {
- print "Can't seek $threadfile.\n\n";
+ foreach my $ind (keys %thread2) {
+ print "$ind: $thread2{$ind}\n"; ####
}
+ print "saved.\n\n";
+ }
+ else {
+ print "Can't seek $threadfile.\n\n";
}
- }
- else {
- print "Can't lock $threadfile.\n\n";
- }
- close ($threadfile);
- }
- else
- {
- print "Can't open $threadpath.\n\n";
- }
- }
- }
-
- elsif ($mode eq 'thread-author') {
- if ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
- my $author = $1;
- if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
- $author = urldecode($2);
- }
- if ($thread{'author'} eq '') {
- $thread{'author'} = $author;
- $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
- }
- }
- }
- elsif ($tag{'<'} eq '/h3') {
- $mode='thread';
- }
- }
-
- elsif ($mode eq 'thread-content') {
- if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
- ++$level2;
- # $thread{'postcontent'}.='<div>';
- $ignoretext=1;
- }
- elsif ($tag{'<'} eq '/div') {
- if($level2){
- --$level2;
- # $thread{'postcontent'}.='</div>';
- unless($level2) {
- $ignoretext=0;
}
}
else {
- $mode = 'thread';
+ print "Can't lock $threadfile.\n\n";
}
+ close ($threadfile);
}
- elsif ($tag{'<'} eq 'br') {
- $thread{'postcontent'}.='<br>';
- $ignoretext=0;
+ else
+ {
+ print "Can't open $threadpath.\n\n";
}
- elsif ($tag{'<'} eq 'p') {
- $thread{'postcontent'}.='<p>';
- $ignoretext=0;
- }
- elsif ($tag{'<'} eq '/p') {
- $thread{'postcontent'}.='</p>';
- $ignoretext=1;
- }
- elsif (!$ignoretext) {
- if ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
- $thread{'postcontent'}.='<a href="'.urldecode($3).'">';
- $link=1;
- }
- elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
- my $person=$1;
- if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
- $person = urldecode($2);
- }
- $thread{'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
- $link=1;
- $hidename=1;
- }
- }
- elsif ($tag{'<'} eq '/a') {
- if($link) {
- $thread{'postcontent'}.='</a>';
- $link=0;
- $hidename=0;
- }
- }
- else {
- # $thread{'postcontent'}.='<!'.$tag{'<'}.'!>';
+ }
+ }
+
+ # author name
+ elsif ($mode eq 'thread-author') {
+ # name can be found in hyperlinks
+ if ($tag{'<'} eq 'a') {
+ # there are two types of facebook user IDs
+ if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+ my $author = $1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $author = urldecode($2);
}
- }
- elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) {
- unless($incomplete) {
- $thread{'postcontent'}.='<p><b>Post not completely archived.</b></p>';
+ if ($thread{'author'} eq '') {
+ $thread{'author'} = $author;
+ $thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
}
- $incomplete=1;
}
}
-
- elsif ($mode eq 'thread-time') {
- if ($tag{'<'} eq '/abbr') {
- $mode = 'thread';
- }
+ # go out of <h3>
+ elsif ($tag{'<'} eq '/h3') {
+ $mode='thread';
}
-
-
- # elsif ($mode eq 'thread-attachment') {
- # if ($tag{'<'} eq 'div') {
- # ++$level2;
- # }
- # elsif ($tag{'<'} eq '/div') {
- # if($level2){
- # --$level2;
- # }
- # else {
- # $mode = 'thread';
- # }
- # }
- # elsif ($tag{'<'} eq 'a') {
- # if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
- # ++$attnumber;
- # $thread{'img-'.$attnumber}='a_'.$2;
- # $mode = 'thread-attachment-img';
- # }
- # elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
- # ++$attnumber;
- # $thread{'link-'.$attnumber}=urldecode($3);
- # $mode = 'thread-attachment-link';
- # }
- # }
- # }
-
- elsif ($mode eq 'thread-attachment-img') {
- if ($tag{'<'} eq 'img') {
- my $imgkey = saveimg($tag{'src'},$thread{'img-'.$attnumber},$$settings{'id'});
- if ($imgkey ne '') {
- $thread{'imgkey-'.$attnumber}=$imgkey;
- }
- else {
- delete $thread{'img-'.$attnumber};
- --$attnumber;
+ }
+
+ # the firstpost's content
+ elsif ($mode eq 'thread-content') {
+ # There should not be any sub<div>s. Ignore everything inside.
+ if ($tag{'<'} eq 'div') {
+ ++$level2;
+ $ignoretext=1;
+ }
+ elsif ($tag{'<'} eq '/div') {
+ if($level2){
+ --$level2;
+ unless($level2) {
+ $ignoretext=0;
}
}
- elsif ($tag{'<'} eq '/a') {
+ else {
$mode = 'thread';
}
}
-
- elsif ($mode eq 'thread-attachment-link') {
- if($tag{'<'} eq 'h3'){
- $mode = 'thread-attachment-link-title';
- }
- elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) {
- my $imgurl = urldecode($3);
- my $imgid='i_';
+ elsif ($tag{'<'} eq 'br') {
+ $thread{'postcontent'}.='<br>';
+ $ignoretext=0;
+ }
+ elsif ($tag{'<'} eq 'p') {
+ $thread{'postcontent'}.='<p>';
+ $ignoretext=0;
+ }
+ elsif ($tag{'<'} eq '/p') {
+ $thread{'postcontent'}.='</p>';
+ $ignoretext=1;
+ }
+ elsif (!$ignoretext) {
+ # inline image (smiley?)
+ if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
+ my $imgurl = urldecode($2);
+ my $imgid = 'r_';
$imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
while(length($imgurl)>240) {
$imgurl=substr($imgurl,120);
}
$imgid.=$imgurl;
+
my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
if ($imgkey ne '') {
- $thread{'img-'.$attnumber}=$imgid;
- $thread{'imgkey-'.$attnumber}=$imgkey;
+ $thread{'postcontent'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
+ }
+ else {
+ $thread{'postcontent'}.='<img src="" alt="'.$tag{'alt'}.'">';
+ }
+ }
+ elsif ($tag{'<'} eq 'a') {
+ # a link to an external page
+ if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+ $thread{'postcontent'}.='<a href="'.urldecode($3).'">';
+ $link=1;
+ }
+ # a link to a user
+ elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
+ my $person=$1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $person = urldecode($2);
+ }
+ $thread{'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $link=1;
+ $hidename=1;
}
}
elsif ($tag{'<'} eq '/a') {
- $mode = 'thread';
+ if($link) {
+ $thread{'postcontent'}.='</a>';
+ $link=0;
+ $hidename=0;
+ }
+ }
+ else {
+ # $thread{'postcontent'}.='<!'.$tag{'<'}.'!>';
}
}
-
- elsif ($mode eq 'thread-attachment-link-title') {
- if ($tag{'<'} eq '/h3') {
- $mode = 'thread-attachment-link';
+ # a link for "more..." outside the <p>s = post incomplete!
+ elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) {
+ unless($incomplete) {
+ $thread{'postcontent'}.='<p><b>Post not completely archived.</b></p>';
}
+ $incomplete=1;
}
-
- elsif ($mode eq 'thread-replies') {
- if ($tag{'<'} eq '/a') {
- $mode = 'thread';
+ }
+
+ # time
+ elsif ($mode eq 'thread-time') {
+ if ($tag{'<'} eq '/abbr') {
+ $mode = 'thread';
+ }
+ }
+
+ # an attached image
+ elsif ($mode eq 'thread-attachment-img') {
+ if ($tag{'<'} eq 'img') {
+ my $imgkey = saveimg($tag{'src'},$thread{'img-'.$attnumber},$$settings{'id'});
+ if ($imgkey ne '') {
+ $thread{'imgkey-'.$attnumber}=$imgkey;
+ }
+ else {
+ delete $thread{'img-'.$attnumber};
+ --$attnumber;
}
}
-
-
- elsif ($mode eq 'posts') {
- if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
- %post = ();
-
- $post{'id'} = $1;
- $post{'threadid'} = $threadid;
- $post{'groupid'} = $$settings{'id'};
- $post{'timenumber'} = $timenumber;
-
- $mode = 'post';
- $level=0;
- $attnumber=0;
-
- if($pagetype eq 'post') {
- if(!$groupid) {
- print "Can't determine if post belongs to group $$settings{'id'}.\n";
- $mode='';
- last;
- }
- elsif($post{'id'} eq $postid) {
- $firstpost=1;
- }
- else {
- $firstpost=0;
- $post{'postid'} = $postid;
- }
- }
- print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n";
+ elsif ($tag{'<'} eq '/a') {
+ $mode = 'thread';
+ }
+ }
+
+ #an attached link
+ elsif ($mode eq 'thread-attachment-link') {
+ #"title" is found in <h3>
+ if($tag{'<'} eq 'h3'){
+ $mode = 'thread-attachment-link-title';
+ }
+ # an image included to illustrate the attached link. Facebook automatically chooses it...
+ elsif (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/safe_image\.php\?(.*&)?url=([^&]+)(&.*)?$/)) {
+ my $imgurl = urldecode($3);
+ my $imgid='i_';
+ $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
+
+ while(length($imgurl)>240) {
+ $imgid.=substr($imgurl,0,120).'-/';
+ $imgurl=substr($imgurl,120);
+ }
+ $imgid.=$imgurl;
+ my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
+ if ($imgkey ne '') {
+ $thread{'img-'.$attnumber}=$imgid;
+ $thread{'imgkey-'.$attnumber}=$imgkey;
}
- elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) {
- if ($1 eq $$settings{'id'}) {
- $groupid = 1;
+ }
+ elsif ($tag{'<'} eq '/a') {
+ $mode = 'thread';
+ }
+ }
+
+ elsif ($mode eq 'thread-attachment-link-title') {
+ if ($tag{'<'} eq '/h3') {
+ $mode = 'thread-attachment-link';
+ }
+ }
+
+ elsif ($mode eq 'thread-replies') {
+ if ($tag{'<'} eq '/a') {
+ $mode = 'thread';
+ }
+ }
+
+ # list of posts. look for <div>s with posts.
+ elsif ($mode eq 'posts') {
+ # post found
+ if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
+ %post = ();
+
+ $post{'id'} = $1;
+ $post{'threadid'} = $threadid;
+ $post{'groupid'} = $$settings{'id'};
+ $post{'timenumber'} = $timenumber;
+
+ $mode = 'post';
+ $level=0;
+ $attnumber=0;
+
+ # if page type is 'post' the URL is not enough to see which group the
+ # post belongs to. has to be found on the page BEFORE entering first
+ # post <div>!
+ if($pagetype eq 'post') {
+ if(!$groupid) {
+ print "Can't determine if post belongs to group $$settings{'id'}.\n";
+ $mode='';
+ last;
+ }
+ elsif($post{'id'} eq $postid) {
+ # In this page type it's important to check if this post is a reply
+ # to a thread or to a post.
+ $firstpost=1;
}
else {
- print "Post does not belong to group $$settings{'id'}.\n";
- $mode = '';
- last;
+ $firstpost=0;
+ $post{'postid'} = $postid;
}
}
+ print "Post ".((($pagetype eq 'post') and !$firstpost)?"$post{'postid'}/":"")."$post{'id'}\n";
}
-
- elsif ($mode eq 'post') {
- if ($tag {'<'} eq 'h3') {
- $mode = 'post-author';
+ # Link can be used to determine which group the post belongs to.
+ elsif (($tag{'<'} eq 'a') and ($pagetype eq 'post') and ($tag{'href'} =~ /^\/groups\/([0-9]+)\/?\?/)) {
+ if ($1 eq $$settings{'id'}) {
+ $groupid = 1;
}
- elsif ($tag{'<'} eq 'abbr') {
- $mode = 'post-time';
+ else {
+ print "Post does not belong to group $$settings{'id'}.\n";
+ $mode = '';
+ last;
}
- if ($tag{'<'} eq 'div') {
- if(($tag{'class'} eq '')and(!defined($post{'content'}))) {
- $mode = 'post-content';
- $level2=0;
- $ignoretext=0;
- $hidename=0;
- $link=0;
- }
- else {
- ++$level;
- }
+ }
+ }
+
+ # one post
+ elsif ($mode eq 'post') {
+ # post author is in first <h3>
+ if ($tag {'<'} eq 'h3') {
+ $mode = 'post-author';
+ }
+ elsif ($tag{'<'} eq 'abbr') {
+ $mode = 'post-time';
+ }
+ if ($tag{'<'} eq 'div') {
+ # Post content is (always?) in the first <div> without a class name after author
+ if(($tag{'class'} eq '')and(!defined($post{'content'}))) {
+ $mode = 'post-content';
+ $level2=0;
+ $ignoretext=0;
+ $hidename=0;
+ $link=0;
+ }
+ else {
+ ++$level;
+ }
+ }
+ elsif ($tag{'<'} eq '/div') {
+ if ($level) {
+ --$level;
}
- elsif ($tag{'<'} eq '/div') {
- if ($level) {
- --$level;
+ else {
+ # end of post <div>. Now prepare to save the file.
+ # path depends if it's a post or a postreply.
+ $mode = 'posts';
+
+ my $postfile;
+ my $postpath = ARCH_PATH.$$settings{'id'}.'/';
+ unless (-d $postpath) {
+ unless (mkdir $postpath) {
+ print "Can't mkdir $postpath.\n";
+ }
+ }
+ if(($pagemode eq 'post')and !$firstpost){
+ $postpath.='postreply/';
}
else {
- $mode = 'posts';
-
- my $postfile;
- my $postpath = ARCH_PATH.$$settings{'id'}.'/';
- unless (-d $postpath) {
- unless (mkdir $postpath) {
- print "Can't mkdir $postpath.\n";
- }
- }
- if(($pagemode eq 'post')and !$firstpost){
- $postpath.='postreply/';
+ $postpath.='post/';
+ }
+ unless (-d $postpath) {
+ unless (mkdir $postpath) {
+ print "Can't mkdir $postpath.\n";
}
- else {
- $postpath.='post/';
+ }
+ $postpath.=$post{'threadid'}.'/';
+ unless (-d $postpath) {
+ unless (mkdir $postpath) {
+ print "Can't mkdir $postpath.\n";
}
+ }
+ if(($pagemode eq 'post')and !$firstpost){
+ $postpath.=$post{'postid'}.'/';
unless (-d $postpath) {
unless (mkdir $postpath) {
print "Can't mkdir $postpath.\n";
}
}
- $postpath.=$post{'threadid'}.'/';
- unless (-d $postpath) {
- unless (mkdir $postpath) {
- print "Can't mkdir $postpath.\n";
+ }
+
+ $postpath.=$post{'id'};
+
+ if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
+ if (flock ($postfile, 2)) {
+ %post2 = readdatafile($postfile);
+
+ # Don't overwrite newer information with older.
+ if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) {
+ print ("Newer version already saved.\n\n");
}
- }
- if(($pagemode eq 'post')and !$firstpost){
- $postpath.=$post{'postid'}.'/';
- unless (-d $postpath) {
- unless (mkdir $postpath) {
- print "Can't mkdir $postpath.\n";
+ else {
+ # delete previous information about attachments - the numbers
+ # could have changed.
+ foreach my $ind (keys %post2) {
+ if($ind =~ /^img(key)?-[0-9]+$/) {
+ delete $post2{$ind};
+ }
}
- }
- }
-
- $postpath.=$post{'id'};
-
- if (sysopen ($postfile, $postpath, O_RDWR | O_CREAT)) {
- if (flock ($postfile, 2)) {
- %post2 = readdatafile($postfile);
-
- if (($post2{'timenumber'} ne '')and($post2{'timenumber'}>$post{'timenumber'})) {
- print ("Newer version already saved.\n\n");
+ # overwrite previous information with new one
+ foreach my $ind (keys %post) {
+ $post2{$ind}=$post{$ind};
}
- else {
+ if ($post2{'key'} eq '') {
+ $post2{'key'} = key(KEY_BITS);
+ }
+ # write data to file
+ if (seek($postfile, 0, 0)) {
+ writedatafile($postfile,%post2);
+ truncate ($postfile , tell($postfile));
+
foreach my $ind (keys %post2) {
- if($ind =~ /^img(key)?-[0-9]+$/) {
- delete $post2{$ind};
- }
- }
- foreach my $ind (keys %post) {
- $post2{$ind}=$post{$ind};
- }
- if ($post2{'key'} eq '') {
- $post2{'key'} = key(KEY_BITS);
- }
- if (seek($postfile, 0, 0)) {
- writedatafile($postfile,%post2);
- truncate ($postfile , tell($postfile));
-
- foreach my $ind (keys %post2) {
- print "$ind: $post2{$ind}\n";
- }
- print "saved.\n\n";
- }
- else {
- print "Can't seek $postfile.\n\n";
+ print "$ind: $post2{$ind}\n";
}
+ print "saved.\n\n";
+ }
+ else {
+ print "Can't seek $postfile.\n\n";
}
}
- else {
- print "Can't lock $postfile.\n\n";
- }
- close ($postfile);
}
- else
- {
- print "Can't open $postpath.\n\n";
+ else {
+ print "Can't lock $postfile.\n\n";
}
+ close ($postfile);
}
- }
-
- elsif ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
- ++$attnumber;
- $post{'img-'.$attnumber}='a_'.$2;
- $mode = 'post-img';
- }
- elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) {
- $mode = 'post-replies';
+ else
+ {
+ print "Can't open $postpath.\n\n";
}
}
}
- elsif ($mode eq 'post-img') {
- if ($tag{'<'} eq 'img') {
- my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'});
- if ($imgkey ne '') {
- $post{'imgkey-'.$attnumber}=$imgkey;
- }
- else {
- delete $post{'img-'.$attnumber};
- --$attnumber;
- }
+ elsif ($tag{'<'} eq 'a') {
+ # there is an image attached
+ if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+ ++$attnumber;
+ $post{'img-'.$attnumber}='a_'.$2;
+ $mode = 'post-img';
}
- elsif ($tag{'<'} eq '/a') {
- $mode = 'post';
+ # the number of replies may be found in one of these links.
+ elsif ($tag{'href'} =~ /^\/comment\/replies\/?\?/) {
+ $mode = 'post-replies';
}
}
-
-
- elsif ($mode eq 'post-author') {
- if ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
- my $author = $1;
- if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
- $author = urldecode($2);
- }
- if ($post{'author'} eq '') {
- $post{'author'} = $author;
- $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
- }
- }
+ }
+
+ # an attached image
+ elsif ($mode eq 'post-img') {
+ if ($tag{'<'} eq 'img') {
+ my $imgkey = saveimg($tag{'src'},$post{'img-'.$attnumber},$$settings{'id'});
+ if ($imgkey ne '') {
+ $post{'imgkey-'.$attnumber}=$imgkey;
}
- elsif ($tag{'<'} eq '/h3') {
- $mode='post';
+ else {
+ delete $post{'img-'.$attnumber};
+ --$attnumber;
}
}
-
- elsif ($mode eq 'post-content') {
- if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
- ++$level2;
- # $post{'content'}.='<div>';
- $ignoretext=1;
- }
- elsif ($tag{'<'} eq '/div') {
- if($level2){
- --$level2;
- # $post{'content'}.='</div>';
- unless($level2){
- $ignoretext=0;
- }
+ elsif ($tag{'<'} eq '/a') {
+ $mode = 'post';
+ }
+ }
+
+ elsif ($mode eq 'post-author') {
+ # name can be found in hyperlinks
+ if ($tag{'<'} eq 'a') {
+ # there are two types of facebook user IDs
+ if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
+ my $author = $1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $author = urldecode($2);
}
- else {
- $mode = 'post';
+ if ($post{'author'} eq '') {
+ $post{'author'} = $author;
+ $post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
}
}
- elsif ($tag{'<'} eq 'br') {
- $post{'content'}.='<br>';
- }
- elsif ($tag{'<'} eq 'p') {
- $post{'content'}.='<p>';
+ }
+ elsif ($tag{'<'} eq '/h3') {
+ $mode='post';
+ }
+ }
+
+ # the post content
+ elsif ($mode eq 'post-content') {
+ # There should not be any sub<div>s. Ignore everything inside.
+ if ($tag{'<'} eq 'div') {
+ ++$level2;
+ $ignoretext=1;
+ }
+ elsif ($tag{'<'} eq '/div') {
+ if($level2){
+ --$level2;
+ unless($level2){
+ $ignoretext=0;
+ }
}
- elsif ($tag{'<'} eq '/p') {
- $post{'content'}.='</p>';
+ else {
+ $mode = 'post';
}
- elsif (!$ignoretext) {
- if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
- my $imgurl = urldecode($2);
- my $imgid = 'r_';
- $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
-
- while(length($imgurl)>240) {
- $imgid.=substr($imgurl,0,120).'-/';
- $imgurl=substr($imgurl,120);
- }
- $imgid.=$imgurl;
+ }
+ elsif ($tag{'<'} eq 'br') {
+ $post{'content'}.='<br>';
+ }
+ elsif ($tag{'<'} eq 'p') {
+ $post{'content'}.='<p>';
+ }
+ elsif ($tag{'<'} eq '/p') {
+ $post{'content'}.='</p>';
+ }
+ elsif (!$ignoretext) {
+ if (($tag{'<'} eq 'img')and($tag{'src'} =~ /^https?:\/\/([a-z0-9\.\-]+)?fbcdn\.net\/rsrc\.php\/(.*)$/)) {
+ my $imgurl = urldecode($2);
+ my $imgid = 'r_';
+ $imgurl =~ s/([^A-Za-z0-9_\.])/sprintf ("@%02X",ord($1))/eg;
- my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
- if ($imgkey ne '') {
- $post{'content'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
- }
- else {
- $post{'content'}.='<img src="" alt="'.$tag{'alt'}.'">';
- }
-
- }
- elsif ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
- $post{'content'}.='<a href="'.urldecode($3).'">';
- $link=1;
- }
- elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
- my $person = $1;
- if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
- $person = urldecode($2);
- }
- $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
- $link=1;
- $hidename=1;
- }
+ while(length($imgurl)>240) {
+ $imgid.=substr($imgurl,0,120).'-/';
+ $imgurl=substr($imgurl,120);
}
- elsif ($tag{'<'} eq '/a') {
- if($link) {
- $post{'content'}.='</a>';
- $link=0;
- $hidename=0;
- }
+ $imgid.=$imgurl;
+
+ my $imgkey = saveimg($tag{'src'},$imgid,$$settings{'id'});
+ if ($imgkey ne '') {
+ $post{'content'}.='<img src="&img@i'.$imgid.'@k'.$imgkey.';" alt="'.$tag{'alt'}.'">';
}
else {
- # $post{'content'}.='<!'.$tag{'<'}.'!>';
+ $post{'content'}.='<img src="" alt="'.$tag{'alt'}.'">';
}
}
-
- }
-
- elsif ($mode eq 'post-time') {
- if ($tag{'<'} eq '/abbr') {
- $mode = 'post';
- }
- }
-
- elsif ($mode eq 'post-replies') {
- if ($tag{'<'} eq '/a') {
- $mode = 'post';
+ elsif ($tag{'<'} eq 'a') {
+ # a link to an external page
+ if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+ $post{'content'}.='<a href="'.urldecode($3).'">';
+ $link=1;
+ }
+ # a link to a user
+ elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
+ my $person = $1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $person = urldecode($2);
+ }
+ $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $link=1;
+ $hidename=1;
+ }
}
- }
-
-
- if ($tag{"\\"} ne '') {
- $closetag = 1;
- next;
- }
-
- local $/ = '<';
- unless (defined ($text = <$contentfile>)) {
- close($contentfile);
- return;
- }
- local $/ = "\n";
- $text =~ s/<$//;
-
- # # DEBUG
- # if ($pagetype eq 'thread') {
- # print ">>$mode: $text\n";
- # }
-
- if($mode eq 'thread-content') {
- unless ($ignoretext or $hidename){
- $thread{'postcontent'}.=$text;
+ elsif ($tag{'<'} eq '/a') {
+ if($link) {
+ $post{'content'}.='</a>';
+ $link=0;
+ $hidename=0;
+ }
}
- }
- elsif ($mode eq 'thread-time') {
- $thread{'timetext'}.=$text;
- }
- elsif ($mode eq 'thread-attachment-link-title') {
- $thread{'linktitle-'.$attnumber}.=$text;
- }
- elsif ($mode eq 'thread-attachment-link') {
- $thread{'linktext-'.$attnumber}.=$text;
- }
- elsif ($mode eq 'thread-replies') {
- if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) {
- $thread{'replies'} = $1;
+ else {
+ # $post{'content'}.='<!'.$tag{'<'}.'!>';
}
}
- if($mode eq 'post-content') {
- unless ($ignoretext or $hidename){
- $post{'content'}.=$text;
- }
- }
- elsif ($mode eq 'post-time') {
- $post{'timetext'}.=$text;
+ }
+
+ elsif ($mode eq 'post-time') {
+ if ($tag{'<'} eq '/abbr') {
+ $mode = 'post';
}
- elsif ($mode eq 'post-replies') {
- if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
- $post{'replies'} = $1;
- }
+ }
+
+ elsif ($mode eq 'post-replies') {
+ if ($tag{'<'} eq '/a') {
+ $mode = 'post';
}
-
}
+ # dealing with the tag is finished.
+ # if the tag ends with "/>" change it into a "</" tag and go through it
+ # again from the start of the loop
+ if ($tag{"\\"} ne '') {
+ $closetag = 1;
+ next;
+ }
- close ($contentfile);
+ # now get the text between tags
+
+ local $/ = '<';
+ unless (defined ($text = <$contentfile>)) {
+ close($contentfile);
+ return;
+ }
+ local $/ = "\n";
+ $text =~ s/<$//;
+
+ # # DEBUG
+ # if ($pagetype eq 'thread') {
+ # print ">>$mode: $text\n";
+ # }
+
+ # depending on state add text to relevant fields.
+
+ if($mode eq 'thread-content') {
+ unless ($ignoretext or $hidename){
+ $thread{'postcontent'}.=$text;
+ }
+ }
+ # The format facebook uses for showing time is not always helpful (for
+ # example: "2 mins ago"). There is no setting to change it in facebook,
+ # at least not on m.facebook.com. The bot currently DOES NOT interpret the
+ # text.
+ elsif ($mode eq 'thread-time') {
+ $thread{'timetext'}.=$text;
+ }
+ elsif ($mode eq 'thread-attachment-link-title') {
+ $thread{'linktitle-'.$attnumber}.=$text;
+ }
+ elsif ($mode eq 'thread-attachment-link') {
+ $thread{'linktext-'.$attnumber}.=$text;
+ }
+ elsif ($mode eq 'thread-replies') {
+ if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) {
+ $thread{'replies'} = $1;
+ }
+ }
+
+ if($mode eq 'post-content') {
+ unless ($ignoretext or $hidename){
+ $post{'content'}.=$text;
+ }
+ }
+ elsif ($mode eq 'post-time') {
+ $post{'timetext'}.=$text;
+ }
+ elsif ($mode eq 'post-replies') {
+ if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
+ $post{'replies'} = $1;
+ }
+ }
}
+ close ($contentfile);
}
# Function to read data from datafiles.
return %data;
}
+# the function to write data to datafiles (see readdatafile() description)
+#
+# First argument can be a path or a file handle. In case of a file handle it
+# will just write to the file. In case of a path it opens the file before
+# writing and closes it after.
+#
+# On failure (file not open) returns 0.
+# On success returns 1.
+#
sub writedatafile {
(my $headerpath, my %header) = @_;
my $headerfile;
return 1;
}
+# Function to get a timenumber from a "date" http header field value.
+# It's a 14 digit number: 4 - year, 2 - month, 2 - day, 2 - hour, 2 - minute,
+# 2 - second.
sub gettimenumber {
(my $date) = @_;
my $year;
my $minute;
my $second;
- # see https://tools.ietf.org/html/rfc2616#section-3.3.1
+ # There are 3 possible formats.
+ # See https://tools.ietf.org/html/rfc2616#section-3.3.1
if ($date =~ /^[A-Za-z]{3}, ([0-9]{2}) ([A-Za-z]{3}) ([0-9]{4}) ([0-9]{2}):([0-9]{2}):([0-9]{2})/){
$day=$1;
$month=lc($2);
return $year.$month.$day.$hour.$minute.$second;
}
+# Function to get information about a html tag.
+# The argument is the tag text without the <>!
+# It returns a hash with the attributes' values.
+# special values:
+# '<' - the tag name (may start with '/'),
+# '/' - when the tag starts with '/',
+# '\' - when the tag ends with '/',
sub taginfo {
(my $tagtext) = @_;
my %tag;
if ($tagtext =~ /\/[ \t\r\n]*$/) {
$tag{"\\"}="\\";
}
- # if ($tag{'<'} eq 'div') {
- # foreach my $ind (keys %tag) {
- # print "$ind: $tag{$ind}\n";
- # }
- # print "\n";
- # }
return %tag;
}
+# Function to generate a random hexadecimal number (key) with a defined number
+# of bits.
sub key {
(my $bits) = @_;
my $p = int($bits / 16);
return $keytext;
}
+# The function to save an image from proxy archive
+# Arguments:
+# 1 - url of image
+# 2 - image id
+# 3 - group id
+# Returns the image's key on success and an empty string ('') on failure.
+#
sub saveimg {
(my $url, my $id, my $groupid) = @_;
print "Image $id\n";
+ # prepare path
$basepath=ARCH_PATH.$groupid.'/';
unless (-d $basepath){
unless (mkdir $basepath) {
}
}
-
my $idtemp = $id;
while((my $ind = index($idtemp,'/'))>=0) {
$basepath.=substr($idtemp, 0, $ind+1);
$headpath=$basepath.'@h';
$imgpath=$basepath.'@v';
+ # check if image already saved
if ( -f $imgpath) {
if (open($headfile,"+<",$headpath)) {
if (flock ($headfile, 2)) {
$archheadpath = $archbasepath.'@h';
$archimgpath = $archbasepath.'@v';
print "url: $url\n";
+ # Read the http header. Only interested in the status. If a redirection then
+ # follow it and read again. If 200 it's ok to continue processing. Otherwise
+ # return.
for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
%archheader = readheaderfile($archheadpath);
return '';
}
+ # save content-type and content-disposition information.
foreach my $ind (keys %archheader) {
if ($ind =~ /^(content-type|content-disposition)$/) {
$header{$ind}=$archheader{$ind};
}
}
+ # generate key
$header{'key'}=key(KEY_BITS);
+ # save image size
if (@stat = stat($archimgpath)){
$header{'content-length'}=$stat[7];
}
}
}
- foreach my $ind (keys %header) {
- print $headfile "$ind: $header{$ind}\n";
- print "$ind: $header{$ind}\n";
- }
+ # save header
+ writedatafile($headfile,%header);
print $headfile "\n";
close ($headfile);
+ # Finally, copy the image.
unless (copy($archimgpath,$imgpath)) {
print "Can't copy $archimgpath.\n\n";
return '';
print "saved.\n\n";
return $header{'key'};
-}
\ No newline at end of file
+}