use Fcntl;
use File::Copy;
###PROXY_LIB;
-use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl readconfigfile entitydecode urldecode);
+use proxy_lib qw(url2path urldiv2path path2urldiv getcgi divideurl joinurl readconfigfile entitydecode urldecode);
use POSIX qw(strftime);
###ARCH_PATH;
my %post;
my %post2;
+ my $pagetype;
+
if ($headerpath =~ /^((.+)\@h)$/) {
$headerpath = $1;
$basepath = $2;
($prot, $host, $port, $path, $query) = path2urldiv($basepath);
+ print 'Page '.joinurl($prot, $host, $port, $path, $query)."\n";
+ ### REDESIGN THE CONDITIONS!
if($query ne '') {
%cgi=getcgi($query);
$id = $cgi{'id'};
+ if ($id =~ /^[0-9]+$/) {
+ $pagetype='thread';
+ }
+ else {
+ $pagetype = 'group';
+ }
}
else {
$id='';
+ $pagetype = 'group';
}
for (my $ind=0; $ind<MAX_REDIRECTIONS; ++$ind) {
}
}
- if ($id =~ /^[0-9]+$/) {
- print "Thread $id\n";
-
- $thread{'id'}=$id;
- $thread{'groupid'}=$groupid;
- $thread{'timenumber'}=$timenumber;
-
- my $line;
-
- my %postdata;
+ # REDESIGN THE CONDITIONS!
+ if ($pagetype) {
unless (open ($contentfile, "<",$contentpath)) {
- print "Can't open file";
+ print "Can't open $contentpath.\n";
return;
}
my $text;
my %tag;
- my $mode = 'thread';
- my $level = 0;
+ my $mode;
+ my $level;
my $level2;
my $closetag=0;
my $ignoretext;
my $link;
my $hidename;
my $attnumber;
+ my $incomplete;
+
+ if ($pagetype eq 'thread') {
+ print "Thread $id\n";
+
+ $thread{'id'}=$id;
+ $thread{'groupid'}=$groupid;
+ $thread{'timenumber'}=$timenumber;
+ $mode = 'thread';
+ $level=0;
+ $attnumber=0;
+ $incomplete=0;
+ }
+ else { #group
+ print "Threads\n";
+ $mode = 'threads';
+ }
+
+ my $line;
local $/ = '<';
unless (defined ($text = <$contentfile>)) {
return;
}
$text =~ s/>$//;
- # print "tag: $text\n";
+ # # DEBUG:
+ # if($pagetype eq 'thread'){
+ # print ">>$mode: <$text>\n";
+ # }
%tag = taginfo($text);
}
local $/ = "\n";
- if ($mode eq 'thread'){
+ if ($mode eq 'threads'){
+ if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([a-zA-Z0-9]_[a-zA-Z0-9]_[a-zA-Z0-9])$/)) {
+ print "Thread [$1]\n";
+ $mode = 'thread';
+ %thread = ();
+ $thread{'groupid'}=$groupid;
+ $thread{'timenumber2'}=$timenumber;
+ $level = 0;
+ $attnumber=0;
+ $incomplete=0;
+ }
+ }
+
+ elsif ($mode eq 'thread'){
+ # print "+++$text+++\n";
if ($tag{'<'} eq 'h3') {
$mode = 'thread-author';
}
- elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bj|bk|bm)$/)) { # These are very helpful names, facebug, thank you!
- $mode='thread-content';
- $level=0;
- $ignoretext=1;
- $hidename=0;
- $link=0;
+ elsif (($tag{'<'} eq 'div')and($tag{'id'} !~ /^ufi_/)) {
+ # These are very 'helpful' class names, facebug, thank you!
+ # After recent changes do I still need the unreliable class name?
+ # Is post content always in first <div> after author with a 2 letter class name?
+ # Let's test:
+ if (($tag{'class'} =~ /^[a-z]{2}$/) and (!defined($thread{'postcontent'})) and (defined($thread{'author'}))) {
+ # if (($tag{'class'} =~ /^(bj|bk|bm)|(db|da)$/) and (!defined($thread{'postcontent'}))) {
+ $mode='thread-content';
+ $level2=0;
+ $ignoretext=1;
+ $hidename=0;
+ $link=0;
+ }
+ # elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)|(dc|db)$/)) { ### NAMES NOT RELIABLE! HAVE TO IMPROVE SERIOUSLY!
+ # $mode='thread-attachment';
+ # $level2=0;
+ # $attnumber=0;
+ # }
+ else {
+ ++$level;
+ }
+ }
+ elsif (($tag{'<'} eq '/div') and $level) {
+ --$level;
}
+
elsif ($tag{'<'} eq 'abbr') {
$mode = 'thread-time';
}
- elsif (($tag{'<'} eq 'div') and ($tag{'class'} =~ /^(bn|bl)$/)) {
- $mode='thread-attachment';
- $level=0;
- $attnumber=0;
+
+ elsif ($tag{'<'} eq 'a') {
+ if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+ ++$attnumber;
+ $thread{'img-'.$attnumber}='a_'.$2;
+ $mode = 'thread-attachment-img';
+ }
+ elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+ ++$attnumber;
+ $thread{'link-'.$attnumber}=urldecode($3);
+ $mode = 'thread-attachment-link';
+ }
+ elsif ($tag{'href'} =~ /^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/) {
+ if ($thread{'id'} eq '') {
+ $thread{'id'} = $2;
+ print "Thread $thread{'id'}\n";
+ }
+ $mode = 'thread-replies';
+ }
}
- elsif (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/)){
- $mode='posts';
+ elsif ((($tag{'<'} eq 'div') and ($tag{'id'} =~ /^ufi_/))or(($tag{'<'} eq '/div') and ($level ==0))) {
+ if ($pagetype eq 'thread') {
+ $mode='posts';
+ }
+ else {
+ $mode='threads';
+ }
my $threadfile;
my $threadpath = ARCH_PATH.$$settings{'id'}.'/';
if (flock ($threadfile, 2)) {
%thread2 = readdatafile($threadfile);
- if (($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'})) {
+ if ((($pagetype eq 'thread')and($thread2{'timenumber'} ne '')and($thread2{'timenumber'}>$thread{'timenumber'}))or(($pagetype ne 'thread')and($thread2{'timenumber2'} ne '')and($thread2{'timenumber2'}>$thread{'timenumber2'}))) {
print ("Newer version already saved.\n\n");
}
else {
+ if($pagetype ne 'thread'){
+ if(($thread2{'timenumber'} ne '')and($thread2{'timenumber'} > $thread{'timenumber2'})) {
+ print ("Newer version of post content already saved.\n");
+ delete $thread{'postcontent'};
+ }
+ elsif($incomplete) {
+ print ("Post content incomplete.\n");
+ if(defined($thread2{'postcontent'})){
+ delete $thread{'postcontent'};
+ }
+ }
+ else {
+ $thread{'timenumber'}=$thread{'timenumber2'};
+ }
+ }
foreach my $ind (keys %thread2) {
if($ind =~ /^((img(key)?)|(link(text|title)?))-[0-9]+$/) {
delete $thread2{$ind};
if ($tag{'<'} eq 'a') {
if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
my $author = $1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $author = urldecode($2);
+ }
if ($thread{'author'} eq '') {
$thread{'author'} = $author;
$thread{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
elsif ($mode eq 'thread-content') {
if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
- ++$level;
- $thread{'postcontent'}.='<div>';
+ ++$level2;
+ # $thread{'postcontent'}.='<div>';
+ $ignoretext=1;
}
elsif ($tag{'<'} eq '/div') {
- if($level){
- --$level;
- $thread{'postcontent'}.='</div>';
+ if($level2){
+ --$level2;
+ # $thread{'postcontent'}.='</div>';
+ unless($level2) {
+ $ignoretext=0;
+ }
}
else {
$mode = 'thread';
}
}
+ elsif ($tag{'<'} eq 'br') {
+ $thread{'postcontent'}.='<br>';
+ $ignoretext=0;
+ }
elsif ($tag{'<'} eq 'p') {
$thread{'postcontent'}.='<p>';
$ignoretext=0;
$link=1;
}
elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
- $thread{'postcontent'}.='<a href="#">'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'});
+ my $person=$1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $person = urldecode($2);
+ }
+ $thread{'postcontent'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
$link=1;
$hidename=1;
}
# $thread{'postcontent'}.='<!'.$tag{'<'}.'!>';
}
}
-
+ elsif(($tag{'<'} eq 'a') and ($tag{'href'}=~/^\/groups\/$$settings{'id'}\/?\?(.*&)?id=([^&]+)(&.*)?$/)) {
+ unless($incomplete) {
+ $thread{'postcontent'}.='<p><b>Post not completely archived.</b></p>';
+ }
+ $incomplete=1;
+ }
}
elsif ($mode eq 'thread-time') {
}
}
- elsif ($mode eq 'thread-attachment') {
- if ($tag{'<'} eq 'div') {
- ++$level;
- }
- elsif ($tag{'<'} eq '/div') {
- if($level){
- --$level;
- }
- else {
- $mode = 'thread';
- }
- }
- elsif ($tag{'<'} eq 'a') {
- if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
- ++$attnumber;
- $thread{'img-'.$attnumber}='a_'.$2;
- $mode = 'thread-attachment-img';
- }
- elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
- ++$attnumber;
- $thread{'link-'.$attnumber}=urldecode($3);
- $mode = 'thread-attachment-link';
- }
- }
- }
+
+ # elsif ($mode eq 'thread-attachment') {
+ # if ($tag{'<'} eq 'div') {
+ # ++$level2;
+ # }
+ # elsif ($tag{'<'} eq '/div') {
+ # if($level2){
+ # --$level2;
+ # }
+ # else {
+ # $mode = 'thread';
+ # }
+ # }
+ # elsif ($tag{'<'} eq 'a') {
+ # if ($tag{'href'} =~ /^\/photo\.php\?(.*&)?fbid=([0-9]+)(&.*)?$/) {
+ # ++$attnumber;
+ # $thread{'img-'.$attnumber}='a_'.$2;
+ # $mode = 'thread-attachment-img';
+ # }
+ # elsif ($tag{'href'} =~ /^https?:\/\/([a-z0-9\.\-]+)?facebook\.com\/l\.php\?(.*&)?u=([^&]+)(&.*)?$/) {
+ # ++$attnumber;
+ # $thread{'link-'.$attnumber}=urldecode($3);
+ # $mode = 'thread-attachment-link';
+ # }
+ # }
+ # }
elsif ($mode eq 'thread-attachment-img') {
if ($tag{'<'} eq 'img') {
}
}
elsif ($tag{'<'} eq '/a') {
- $mode = 'thread-attachment';
+ $mode = 'thread';
}
}
}
}
elsif ($tag{'<'} eq '/a') {
- $mode = 'thread-attachment';
+ $mode = 'thread';
}
}
}
}
+ elsif ($mode eq 'thread-replies') {
+ if ($tag{'<'} eq '/a') {
+ $mode = 'thread';
+ }
+ }
+
elsif ($mode eq 'posts') {
if (($tag{'<'} eq 'div') and ($tag{'id'} =~ /^([0-9]+)$/)) {
$mode = 'post-time';
}
if ($tag{'<'} eq 'div') {
- if(($tag{'class'} eq '')and($post{'content'} eq '')) {
+ if(($tag{'class'} eq '')and(!defined($post{'content'}))) {
$mode = 'post-content';
$level2=0;
$ignoretext=0;
if ($tag{'<'} eq 'a') {
if ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)?$/) {
my $author = $1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $author = urldecode($2);
+ }
if ($post{'author'} eq '') {
$post{'author'} = $author;
$post{'name'} = ($$names{$author} ne '')?$$names{$author}:$$names{'default'};
elsif ($mode eq 'post-content') {
if ($tag{'<'} eq 'div') { # There should not be any sub<div>s!
++$level2;
- $post{'content'}.='<div>';
+ # $post{'content'}.='<div>';
+ $ignoretext=1;
}
elsif ($tag{'<'} eq '/div') {
if($level2){
--$level2;
- $post{'content'}.='</div>';
+ # $post{'content'}.='</div>';
+ unless($level2){
+ $ignoretext=0;
+ }
}
else {
$mode = 'post';
$link=1;
}
elsif ($tag{'href'} =~ /^\/([A-Za-z0-9\.]+)(\?.*)$/) {
- $post{'content'}.='<a href="#">'.(($$names{$1} ne '')?$$names{$1}:$$names{'default'});
+ my $person = $1;
+ if ($tag{'href'} =~ /^\/profile\.php\?(.*&)?id=([^&]+)(&.*)?$/) {
+ $person = urldecode($2);
+ }
+ $post{'content'}.='<a href="#">'.(($$names{$person} ne '')?$$names{$person}:$$names{'default'});
$link=1;
$hidename=1;
}
}
}
else {
- $post{'content'}.='<!'.$tag{'<'}.'!>';
+ # $post{'content'}.='<!'.$tag{'<'}.'!>';
}
}
local $/ = "\n";
$text =~ s/<$//;
+ # # DEBUG
+ # if ($pagetype eq 'thread') {
+ # print ">>$mode: $text\n";
+ # }
+
if($mode eq 'thread-content') {
unless ($ignoretext or $hidename){
$thread{'postcontent'}.=$text;
elsif ($mode eq 'thread-attachment-link') {
$thread{'linktext-'.$attnumber}.=$text;
}
+ elsif ($mode eq 'thread-replies') {
+ if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+comments?/) {
+ $thread{'replies'} = $1;
+ }
+ }
if($mode eq 'post-content') {
unless ($ignoretext or $hidename){
$post{'timetext'}.=$text;
}
elsif ($mode eq 'post-replies') {
- if($text =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
+ if(lc($text) =~ /^[ \t\r\n]*([0-9]+)[ \t\r\n]+repl(y|ies)/) {
$post{'replies'} = $1;
}
}