###PERL;
# bot is generated from bot.1.pl
-# 02.01.2016
+# 13.01.2016
#
# This is the facebook bot. It depends on the proxy.
# It reads pages from m.facebook.com archived on the proxy, extracts threads,
use File::Copy;
###PROXY_LIB;
###FACEBUG_LIB;
-use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode entityencode urldecode readheaderfile);
+use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode entityencode urldecode urlencode readheaderfile);
use facebug_lib qw(key readdatafile writedatafile gettimenumber);
use POSIX qw(strftime locale_h);
###GROUPSETTINGS_PATH;
###KEY_BITS;
###MAX_REDIRECTIONS;
+###DETAILED_LOGS;
my $time = time();
srand ($time-$$);
my $closetag=0;# if there is a tag to close
my $ignoretext;# if text should be ignored and not added to post/thread content
my $link; # if bot is inside a link
+ my $namelink;
my $hidename; # if bot is inside a part which contains a name to be hidden
my $attnumber; # number of current attachment
my $incomplete;# if thread firstpost's content is incomplete
my $firstpost; # if the bot is in the firstpost (important if pagetype='post')
+ my $description;
my $content;
# thread author is in first <h3>
if (($tag{'<'} eq 'h3') and ($thread{($shared?'shared-':'').'author'} eq '')) {
$mode = 'thread-author';
+ # entering the author heading: clear the per-link state flags
+ $link=0;
+ $namelink=0;
+ unless ($shared) {
+ $description=0;
+ }
+ $hidename=0;
}
# elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
elsif ($tag{'<'} eq 'div') {
$ignoretext=1; # text in firstposts only inside <p>
$hidename=0;
$link=0;
+ $namelink=0;
}
# The thread contains "shared" content
elsif (($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/) and !$shared and ($thread{'shared'} eq '')) {
if (lc($tag{'aria-label'})eq 'likes') {
$mode='thread-likes';
}
- ###DUPLICADED!
+ ### this code had to be duplicated:
# there is an image attached
elsif ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
my $imgnum=$2;
$mode = 'thread-attachment-img';
}
}
- ###END OF DUPLICATED
+ ###END OF DUPLICATED CODE
# there is a link attached
elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
++$attnumber;
}
}
+ if ($description <= 0) {
+ delete $thread{'description'};
+ }
+
# overwrite previous information with new one
foreach my $ind (keys %thread) {
$thread2{$ind}=$thread{$ind};
writedatafile($threadfile,%thread2);
truncate ($threadfile , tell($threadfile));
- foreach my $ind (keys %thread2) {
- print "$ind: $thread2{$ind}\n"; ####
+ if (DETAILED_LOGS) {
+ foreach my $ind (keys %thread2) {
+ print "$ind: $thread2{$ind}\n"; ####
+ }
}
print "saved.\n\n";
}
# author name
elsif ($mode eq 'thread-author') {
# name can be found in hyperlinks
- if ($tag{'<'} eq 'a') {
+ if ($tag{'<'} eq 'span') {
+ $description=-1;
+ }
+ elsif ($tag{'<'} eq 'a') {
+ if ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+ unless ($shared) {
+ $thread{'description'}.='<a href="'.entityencode(urldecode($3)).'">';
+ $link=1;
+ }
+ }
# there are two types of facebook user IDs
- if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)\/?(\?.*)?$/) {
- my $author = $1;
- if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
- $author = urldecode($2);
+ elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)\/?(\?.*)?$/) {
+ my $person = $1;
+ unless ($person =~ /^(photo|post).php$/) {
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $person = urldecode($2);
+ }
+ unless ($shared) {
+ $thread{'description'}.='<b class="ni">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $link=1;
+ $namelink=1;
+ $hidename=1;
+ }
+
+ if ($thread{($shared?'shared-':'').'author'} eq '') {
+ $thread{($shared?'shared-':'').'author'} = $person;
+ $thread{($shared?'shared-':'').'name'} = ($$names{$person} ne '')?$$names{$person}:$$names{'default'};
+ }
+ }
+ }
+ elsif ($tag{'href'} eq '#') {
+ unless ($shared) {
+ $thread{'description'}.='<b class="ni">'.$$names{'default'};
+ $link=1;
+ $namelink=1;
+ $hidename=1;
}
if ($thread{($shared?'shared-':'').'author'} eq '') {
- $thread{($shared?'shared-':'').'author'} = $author;
- $thread{($shared?'shared-':'').'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
+ $thread{($shared?'shared-':'').'name'} = $$names{'default'};
}
}
}
+ elsif ($tag{'<'} eq '/a') {
+ if($link) {
+ if($namelink) {
+ $thread{'description'}.='</b>';
+ $namelink=0;
+ }
+ else {
+ $thread{'description'}.='</a>';
+ }
+ $link=0;
+ $hidename=0;
+ }
+ }
# go out of <h3>
elsif ($tag{'<'} eq '/h3') {
$mode='thread';
if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
$person = urldecode($2);
}
- $thread{($shared?'shared-':'').'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $thread{($shared?'shared-':'').'postcontent'}.='<b class="ni">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $link=1;
+ $namelink=1;
+ $hidename=1;
+ }
+ elsif ($tag{'href'} eq '#') {
+ $thread{($shared?'shared-':'').'postcontent'}.='<b class="ni">'.$$names{'default'};
$link=1;
+ $namelink=1;
$hidename=1;
}
}
elsif ($tag{'<'} eq '/a') {
if($link) {
- $thread{($shared?'shared-':'').'postcontent'}.='</a>';
+ if($namelink) {
+ $thread{($shared?'shared-':'').'postcontent'}.='</b>';
+ $namelink=0;
+ }
+ else {
+ $thread{($shared?'shared-':'').'postcontent'}.='</a>';
+ }
$link=0;
$hidename=0;
}
elsif ($tag{'<'} eq 'a') {
if (($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) and ($pagetype ne 'thread')) {
unless($incomplete) {
- $thread{($shared?'shared-':'').'postcontent'}.='<p><b>Post not completely archived.</b></p>';
+ $thread{($shared?'shared-':'').'postcontent'}.='<p><b class = "br">Post not completely archived.</b></p>';
}
$incomplete=1;
}
- ### HAD TO BE DUPLICATED :(
+ ### this code had to be duplicated:
# there is an image attached
$ignoretext=0;
$hidename=0;
$link=0;
+ $namelink=0;
}
else {
++$level;
writedatafile($postfile,%post2);
truncate ($postfile , tell($postfile));
- foreach my $ind (keys %post2) {
- print "$ind: $post2{$ind}\n";
+ if(DETAILED_LOGS) {
+ foreach my $ind (keys %post2) {
+ print "$ind: $post2{$ind}\n";
+ }
}
print "saved.\n\n";
}
if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
$person = urldecode($2);
}
- $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $post{'content'}.='<b class="ni">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
+ $link=1;
+ $namelink=1;
+ $hidename=1;
+ }
+ elsif ($tag{'href'} eq '#') {
+ $post{'content'}.='<b class="ni">'.$$names{'default'};
$link=1;
+ $namelink=1;
$hidename=1;
}
}
elsif ($tag{'<'} eq '/a') {
if($link) {
- $post{'content'}.='</a>';
+ if ($namelink) {
+ $post{'content'}.='</b>';
+ $namelink=0;
+ }
+ else {
+ $post{'content'}.='</a>';
+ }
$link=0;
$hidename=0;
}
$thread{($shared?'shared-':'').'postcontent'}.=$text;
}
}
+ if($mode eq 'thread-author') {
+ unless ($hidename or $shared){
+ $thread{'description'}.=$text;
+ if (($text =~ /[a-z]/) and (!$link) and ($description == 0)) {
+ $description = 1;
+ }
+ }
+ }
# the format facebook uses for showing time is not always helpful (for
# example: "2 mins ago") There is no setting to change it in facebook.
# at least in the m.facebook.com. The bot corrently DOES NOT interpret the
# text.
elsif ($mode eq 'thread-time') {
- $thread{($shared?'shared-':'').'timetext'}.=$text;
+ $thread{($shared?'shared-':'').'timetext'}.=facebooktime($timenumber,$text);
}
elsif ($mode eq 'thread-attachment-link-title') {
$thread{($shared?'shared-':'').'linktitle-'.$attnumber}.=$text;
}
}
elsif ($mode eq 'post-time') {
- $post{'timetext'}.=$text;
+ $post{'timetext'}.=facebooktime($timenumber,$text);
}
elsif ($mode eq 'post-link-title') {
$post{'linktitle-'.$attnumber}.=$text;
$archbasepath = urldiv2path($prot, $host, $port, $path, $query);
$archheadpath = $archbasepath.'@h';
$archimgpath = $archbasepath.'@v';
- print "url: $url\n";
+ if (DETAILED_LOGS) {
+ print "url: $url\n";
+ }
# Read the http header. Only interested in the status. If a redirection then
# follow it and read again. If 200 it's ok to continue processing. Otherwise
# return.
if ($location !~ /^[a-z]+:\/\//) {
$location = $prot.'://'.$host.(($port ne '')?(':'.$port):'').$location;
}
- print "Redirect: $location\n";
+ if (DETAILED_LOGS) {
+ print "Redirect: $location\n";
+ }
($prot, $host, $port, $path, $query) = divideurl($location);
$archbasepath = urldiv2path($prot, $host, $port, $path, $query);
$archheadpath = $archbasepath.'@h';
return $header{'key'};
}
+
+# function to get time from facebook time text
+# arguments: 1 - time number (when page was downloaded); it is read as digits
+#                in the order YYYYMMDDhhmm
+#            2 - facebook time text, for example "Jan 2", "January 2, 2015",
+#                "Jan 2 at 10:02am", "2 hrs", "4 minutes ago", "2 days ago"
+#                or "Yesterday at 10:02am" (case does not matter)
+# returns a time number on success (YYYYMMDD, YYYYMMDDhh or YYYYMMDDhhmm,
+# depending on how precise the time text is) and the unchanged time text on
+# failure.
+sub facebooktime {
+    my ($timenumber, $timetext) = @_;
+
+    # kept untouched so that a failure really returns the unchanged text
+    my $original = $timetext;
+    $timetext = lc($timetext);
+
+    my $year;
+    my $month;
+    my $day;
+    my $hour = '';
+    my $minute = '';
+
+    my $subtype = '';   # unit of a relative time: 'm', 'h' or 'd'
+    my $subvalue = 0;   # how many of these units have to be subtracted
+
+    # 12-hour clock to 24-hour clock: 12am is hour 0 and 12pm is hour 12
+    my $hour24 = sub {
+        my ($clockhour, $ampm) = @_;
+        $clockhour %= 12;
+        $clockhour += 12 if $ampm eq 'pm';
+        return $clockhour;
+    };
+
+    # number of days of a month, leap years included
+    my $monthdays = sub {
+        my ($y, $m) = @_;
+        if ($m == 2) {
+            return ((($y % 4 == 0) and ($y % 100 != 0)) or ($y % 400 == 0)) ? 29 : 28;
+        }
+        return (($m == 4) or ($m == 6) or ($m == 9) or ($m == 11)) ? 30 : 31;
+    };
+
+    my %monthnumber = (
+        'jan' => 1, 'feb' => 2,  'mar' => 3,  'apr' => 4,
+        'may' => 5, 'jun' => 6,  'jul' => 7,  'aug' => 8,
+        'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
+    );
+
+    # There are multiple possible formats of the facebook time text
+
+    # Jan 2
+    # January 2
+    if ($timetext =~ /^ *([a-z]{3,}) +([0-9]+) *$/) {
+        $month = $1;
+        $day = int($2);
+    }
+    # Jan 2, 2015
+    # January 2, 2015
+    elsif ($timetext =~ /^ *([a-z]{3,}) +([0-9]+), +([0-9]{4}) *$/) {
+        $month = $1;
+        $day = int($2);
+        $year = int($3);
+    }
+    # Jan 2 at 10:02am
+    # January 2 at 10:02am
+    elsif ($timetext =~ /^ *([a-z]{3,}) +([0-9]+) +at +([0-9]{1,2}):([0-9]{2})([ap]m) *$/) {
+        $month = $1;
+        $day = int($2);
+        $minute = int($4);
+        $hour = $hour24->(int($3), $5);
+    }
+    # Jan 2, 2015 at 10:02am
+    # January 2, 2015 at 10:02am
+    elsif ($timetext =~ /^ *([a-z]{3,}) +([0-9]+), +([0-9]{4}) +at +([0-9]{1,2}):([0-9]{2})([ap]m) *$/) {
+        $month = $1;
+        $day = int($2);
+        $year = int($3);
+        $minute = int($5);
+        $hour = $hour24->(int($4), $6);
+    }
+    # 1 hr
+    # 2 hours ago
+    # etc...
+    elsif ($timetext =~ /^ *([0-9]+) +h(ou)?rs?( +ago)? *$/) {
+        $subtype = 'h';
+        $subvalue = int($1);
+        $hour = int(substr($timenumber,8,2));
+    }
+    # 4 mins
+    # 2 minutes ago
+    # etc...
+    elsif ($timetext =~ /^ *([0-9]+) +min(ute)?s?( +ago)? *$/) {
+        $subtype = 'm';
+        $subvalue = int($1);
+        $minute = int(substr($timenumber,10,2));
+        $hour = int(substr($timenumber,8,2));
+    }
+    # 2 days ago
+    # etc...
+    elsif ($timetext =~ /^ *([0-9]+) +days?( +ago)? *$/) {
+        $subtype = 'd';
+        $subvalue = int($1);
+    }
+    # yesterday at 10:02am
+    elsif ($timetext =~ /^ *yesterday +at +([0-9]{1,2}):([0-9]{2})([ap]m) */) {
+        $subtype = 'd';
+        $subvalue = 1;
+        $minute = int($2);
+        $hour = $hour24->(int($1), $3);
+    }
+    else {
+        # print "FAIL 1 -".urlencode($timetext)."-\n";
+        return $original;
+    }
+
+    # everything the time text did not say is taken from the time number
+    unless (defined($year) and ($year =~ /^[0-9]+$/)) {
+        $year = int(substr($timenumber,0,4));
+    }
+    if (!defined($month) or ($month eq '')) {
+        $month = int(substr($timenumber,4,2));
+    }
+    elsif ($month !~ /^[0-9]+$/) {
+        # month names are recognized by their first three letters
+        my $number = $monthnumber{substr($month,0,3)};
+        unless (defined($number)) {
+            # print "FAIL 2\n";
+            return $original;
+        }
+        $month = $number;
+    }
+    unless (defined($day) and ($day =~ /^[0-9]+$/)) {
+        $day = substr($timenumber,6,2);
+    }
+
+    # relative times: subtract and carry over into the next bigger unit
+    if ($subtype eq 'm') {
+        $minute -= $subvalue;
+        $subvalue = 0;
+        while ($minute < 0) {
+            $minute += 60;
+            ++$subvalue;
+            $subtype = 'h';
+        }
+    }
+    if ($subtype eq 'h') {
+        $hour -= $subvalue;
+        $subvalue = 0;
+        while ($hour < 0) {
+            $hour += 24;
+            ++$subvalue;
+            $subtype = 'd';
+        }
+    }
+    if ($subtype eq 'd') {
+        $day -= $subvalue;
+        # step back month by month until the day is inside a month again, so
+        # that "3 days ago" on March 1 gives 3 days before and not simply the
+        # last day of February
+        while ($day < 1) {
+            --$month;
+            if ($month < 1) {
+                --$year;
+                $month = 12;
+            }
+            $day += $monthdays->($year, $month);
+        }
+    }
+
+    my $returnvalue = sprintf('%04u%02u%02u',$year,$month,$day);
+    # hour and minute are only added if they are known
+    if ($hour ne '') {
+        $returnvalue .= sprintf('%02u',$hour);
+        if ($minute ne '') {
+            $returnvalue .= sprintf('%02u',$minute);
+        }
+    }
+    return $returnvalue;
+}
\ No newline at end of file
###PERL;
# interface.pl is generated from interface.1.pl
-# 05.01.2015
+# 13.01.2016
#
# This is the software of the facebook interface, to access archived groups,
# threads, images, etc.
}
$$postcontent =~ s/&img([^;]+);/insertimg($1)/ge;
print '<div class="post-'.($even?'0':'1').'" id="'.(($pmode eq 'thread')?'':($idt[1].'-'.(($pmode eq 'post')?'':($idt[2].'-')))).$postid.'">';
- print '<div class="post-name">'.entityencode($$post{'name'}).(($$post{'timetext'} ne '')?(' • '.$$post{'timetext'}):'').(($$post{'likes'} ne '')?(' • likes: <b>'.$$post{'likes'}.'</b>'):'').(($$post{'replies'} ne '')?(' • replies: <b>'.$$post{'replies'}.'</b>'):'').'</div>'."\n";
- print '<div class="post-content-'.($even?'0':'1').'">'.$$postcontent.'</div>'."\n";
+ print '<div class="post-name">'.entityencode($$post{'name'}).(($$post{'timetext'} ne '')?(' • '.showtime($$post{'timetext'})):'').(($$post{'likes'} ne '')?(' • likes: <b>'.$$post{'likes'}.'</b>'):'').(($$post{'replies'} ne '')?(' • replies: <b>'.$$post{'replies'}.'</b>'):'').'</div>'."\n";
+ print '<div class="post-content-'.($even?'0':'1').'">';
+ if ($$post{'description'} ne '') {
+ print '<p><b class = "br">'.$$post{'description'}.'</b></p>';
+ }
+ print $$postcontent.'</div>'."\n";
# "shared post" (post inside a post)
if ($$post{'shared'}) {
}
$$postcontent =~ s/&img([^;]+);/insertimg($1)/ge;
print '<div class="post-'.($even?'1':'0').'" id="'.(($pmode eq 'thread')?'':($idt[1].'-'.(($pmode eq 'post')?'':($idt[2].'-')))).'-shared'.$postid.'">';
- print '<div class="post-name">'.entityencode($$post{'shared-name'}).(($$post{'shared-timetext'} ne '')?(' • '.$$post{'shared-timetext'}):'').'</div>'."\n";
- print '<div class="post-content-'.($even?'1':'0').'">'.$$postcontent.'</div>'."\n";
+ print '<div class="post-name">'.entityencode($$post{'shared-name'}).(($$post{'shared-timetext'} ne '')?(' • '.showtime($$post{'shared-timetext'})):'').'</div>'."\n";
+ print '<div class="post-content-'.($even?'1':'0').'">';
+ if ($$post{'shared-description'} ne '') {
+ print '<p><b class = "br">'.$$post{'shared-description'}.'</b></p>';
+ }
+ print $$postcontent.'</div>'."\n";
# shared posts can have attachments too
$first=1;
# reverse order
print '<form class="inline" method="post" action="'.GROUP_PATH.'/'.$threadid.'" ><input type="hidden" name="skey" value="'.$skey.'">';
if($rev){
- print '<input type="submit" value="show chronological" class="button">';
+ print '<input type="submit" value="show oldest first" class="button">';
}
else {
- print '<input type="submit" value="show antichronological" class="button"><input type="hidden" name="rev" value="1">';
+ print '<input type="submit" value="show newest first" class="button"><input type="hidden" name="rev" value="1">';
}
print '</form>'."\n";
}
next;
}
$even = !$even;
- print '<tr class="'.($even?'list-entry-0':'list-entry-1').'"><td class="list-cell">'.(($group{'name'}ne'')?(entityencode($group{'name'})):$groupid).'</td><td class="list-cell"><form method="post" action="'.GROUP_PATH.'/'.$groupid.'" class="inline"><input type="hidden" name="skey" value="'.$skey.'"><input type="submit" value="show chronological" class="button"></form> <form method="post" action="'.GROUP_PATH.'/'.$groupid.'" class="inline"><input type="hidden" name="skey" value="'.$skey.'"><input type="hidden" name="rev" value="1"><input type="submit" value="show antichronological" class="button"></form></td></tr>'."\n";
+ print '<tr class="'.($even?'list-entry-0':'list-entry-1').'"><td class="list-cell">'.(($group{'name'}ne'')?(entityencode($group{'name'})):$groupid).'</td><td class="list-cell"><form method="post" action="'.GROUP_PATH.'/'.$groupid.'" class="inline"><input type="hidden" name="skey" value="'.$skey.'"><input type="submit" value="show oldest first" class="button"></form> <form method="post" action="'.GROUP_PATH.'/'.$groupid.'" class="inline"><input type="hidden" name="skey" value="'.$skey.'"><input type="hidden" name="rev" value="1"><input type="submit" value="show newest first" class="button"></form></td></tr>'."\n";
}
print '</table>'."\n";
}
# if password confirmed create the key and the access file
- $key=$time.'f'.key(KEY_BITS);
+ $key=key(KEY_BITS);
$accesspath=ACCESS_PATH.$key;
open ($accessfile,">",$accesspath) or return loginpage("Couldn't create temporary file $accesspath.","Status: 500 Internal Server Error\n");
my $dir;
print '<form class="inline" method="post" action="'.INTERFACE_PATH.'">';
- print '<input type="submit" value="show the previous page" class="button">';
+ print '<input type="submit" value="continue to the previously requested page" class="button">';
print '<input type="hidden" name="skey" value="'.entityencode($key).'">';
print '<input type="hidden" name="mode" value="'.entityencode($cgi{'rmode'}).'">';
if ($cgi{'rid'} ne '') {
}
return;
}
+
+# function to make a stored time text readable
+# argument: 1 - time text; either a time number (digits in the order
+#               YYYYMMDD, optionally followed by hh and mm) or any other text
+# returns "D.MM.YYYY", "D.MM.YYYY, h" or "D.MM.YYYY, h:mm" for a time number
+# and the unchanged text for everything else.
+sub showtime {
+    my ($timetext) = @_;
+
+    # A time number needs at least the 8 digits of the date. Shorter numbers
+    # and other texts cannot be taken apart and are shown as they are.
+    unless (defined($timetext) and ($timetext =~ /^[0-9]{8,}$/)) {
+        return $timetext;
+    }
+
+    my $year = int(substr($timetext,0,4));
+    my $month = substr($timetext,4,2);
+    my $day = int(substr($timetext,6,2));
+    my $returnvalue = $day.'.'.$month.'.'.$year;
+
+    # hour and minute are optional
+    if (length($timetext) >= 10) {
+        $returnvalue .= ', '.int(substr($timetext,8,2));
+        if (length($timetext) >= 12) {
+            $returnvalue .= ':'.substr($timetext,10,2);
+        }
+    }
+    return $returnvalue;
+}