#!/usr/bin/perl # # lj-index.pl is based on ljsm.pl (CVS revision 1.51) by # Alexander Nikolaev # Takes a directory produced my ljsm-strip.pl and ljdir.pl # and makes a contents file similar to that produced by ljsm-strip.pl. # Downloads userinfo and makes index file from it. # Uses .header and .footer files made by ljdir.pl # All work was done under GPL. # Misha Verbitsky # http://imperium.lenin.ru/~verbit my ($ua, $HEAD, $FOOT, $res, $req, %stat, %memories, %posts, %images,); use constant LOGIN => ''; # leave it empty if you don't want to login use constant PASSWORD => ''; use constant START_YEAR => 2001; # fetch data back to this year use constant CLEANUP_HTML => 1; # 0 - leave html as it is, 1 - remove sidebars etc use constant UTF8_DECODE => 1; # convert text to local charset use constant LOCAL_CHARSET => 'KOI8-R'; use constant DEBUG_LEVEL => 2; # 0 - quiet, 1 - essential, 2 - verbose use constant LOCAL_DIR => ''; # local directory to put files into. Leave it empty to put in the current directory. Slash (/, if not empty) in the end required. use constant SAVE_PICS => 1; # download userpics and standard icons (requires CLEANUP_HTML > 0) use constant HTTP_PROXY => ''; # set proxy URL if you use http proxy use constant CVSVERSION => '$Revision: 0.9 $'; # don't touch this # =================================================================== # end of public constants definition. no user-editable parts below this line # =================================================================== use constant BASE_URL => 'http://www.livejournal.com/'; use constant CATALOG_URL => BASE_URL . 'view/?type=month'; use constant LOGIN_SCRIPT => 'login.bml'; use constant POST_SCRIPT => 'talkread.bml'; use constant MEMO_SCRIPT => 'tools/memories.bml'; use constant HTML_FOOTER => "\n\n"; use constant HTML_HEADER => ' Journal entry '; my $LENTA = "f"; my $FOOTER="

Advertisement on IMPERIUM.LENIN.RU:

"; use LWP::UserAgent; use HTTP::Cookies; use HTML::Form; use HTML::SimpleLinkExtor; use File::Path; use File::Basename; use File::Find; use Date::Manip qw(ParseDate); # To convert dates of postings - M. V. if (UTF8_DECODE) { use Unicode::MapUTF8 qw(from_utf8); } use Getopt::Std; #use Data::Dumper; use strict; # sanity checks # init global vars $ua = new LWP::UserAgent; $ua->cookie_jar(new HTTP::Cookies( file => "ljcookies.txt", autosave => 1) ); # set proxy URL for LWP requests $ua->proxy('http', HTTP_PROXY) if HTTP_PROXY; if (@ARGV >0) { get_info("$ARGV[0]"); index_ljsm("$ARGV[0]"); } else { do_all(); main_index (); } sub do_all { my @topdir = glob "*"; my @users = (); foreach (@topdir) { # print "Testing the directory $_...\n"; next if (! (-d $_)); next if ($_ eq $LENTA); next if ((! (-d "$_/2001")) && (! (-d "$_/2002"))); push (@users, $_); } foreach (@users) { print "lj-index.pl: Fetching userinfo for $_...\n"; get_info($_); logmsg("rebuilding index file for user $_...\n"); index_ljsm($_); logmsg("done.\n"); } } sub destroy_everything # Ultra-dangerous! # Removes all files, except the index { my @topdir = glob "*"; my @users = (); foreach (@topdir) { next if (! (-d $_)); next if ((! (-d "$_/2001")) && (! (-d "$_/2002"))); push (@users, $_); } my $user= ""; foreach $user (@users) { my $year=0; my $month=0; print "Cleaning the directory for $user (DESTRUCTIVE!)...\n"; my $dir = "$user/memories"; my @file_list = glob("$dir/*"); my $file = ""; foreach $file (@file_list) { # print "Testing $file...\n"; if (! ($file =~ /index[0-9]*\.html/)) { unlink $file || die "Cannot unlink $file: $!\n"; } } foreach $year (2001..2002) { foreach $month (1..12) { my $dir = "$user/$year/$month"; my @file_list = glob("$dir/*"); my $file = ""; foreach $file (@file_list) { # print "Testing $file...\n"; if (! ($file =~ /index[0-9]*\.html/)) { unlink $file || die "Cannot unlink $file: $!\n"; } } } } } } ########################################################## ########################################################## ########################################################## # Get userinfo, translate it from UTF, clean HTML, # write info.html and index.html sub get_info { my $user = $_[0]; open (HDR, "< $user/.header") || warn ("couldn't open $user/.header: $!"); $HEAD= join("", ); close (HDR) || warn ("couldn't close $user/.header: $!"); open (FTR, "<$user/.footer") || warn ("couldn't open $user/.footer: $!"); $FOOT= join("", ); close (FTR) || warn ("couldn't close $user/.footer: $!"); my $content=""; #fetch info if (! ( $content = get_page("http://www.livejournal.com/userinfo.bml?user=$user&mode=full"))) { print "Error fetching $user\'s userinfo!"; return; } my $dir = LOCAL_DIR . "$user"; my $up = ".."; my $result = ''; foreach (split(/\n/, $content)) { if (// .. // ) { next if (// .. /<\/table>/); # decode UTF8 $_ = from_utf8({-string => $_, -charset => LOCAL_CHARSET}) if (UTF8_DECODE); $result .= "$_\n"; map { $images{$_} = 1} grep(m{/(img|userpic)/}, split(/\s+/, $_)); } } logmsg("processing pictures and relative links...\n", 4); $result =~ s/src='\/(userpic|img)\//src='$up\/$1\//sg; $result =~ s/src='@{[BASE_URL]}(img\/(.*?)\.gif)'/src="$up\/$1"/sg; # Remove some junk HTML $result =~ s/(.*?)<\/a>/$2/g; my $HEAD_tmp = $HEAD; $HEAD_tmp =~ s/ljdir.pl title/\u$user userinfo/; my $res_info = $HEAD_tmp. "

\u$user\'s Livejournal Userinfo

". $result . $FOOT; # replace all relative links with absolute ones $res_info =~ s/> $dir/info.html\n",2); open DF,">$dir/info.html" or die "error opening $dir/info.html for writing: $!\n"; print DF $res_info; close DF; if (scalar keys %images) { get_pics(); } # Now making the index file # Making the links to user's posts and memories # How many of them are there? my @monthly = glob ("$user/[0-9][0-9][0-9][0-9]/[0-9][0-9] $user/[0-9][0-9][0-9][0-9]/[0-9]"); my %months = (); my @monthnames = ('','Январь','Февраль','Март','Апрель','Май','Июнь', 'Июль','Август','Сентябрь','Октябрь','Ноябрь','Декабрь'); # print "Monthly...\n"; print @monthly; foreach (@monthly) { if ((/\/(2\d\d\d\/\d\d)/ || /\/(2\d\d\d\/\d)/) && (-d $_)) { $months{$1}=1; # print "Directory $1 found...\n"; } } my $tab_index = ""; # This is not very pretty. Better do something with date range. Oh well. my $year=2001; my $month=0; foreach $month (2..12) { $tab_index.=writetabindex($user,$year,$month,%months); $tab_index.= "" if ($month ==6); } $tab_index.= ""; $year=2002; foreach $month (1..10) { # print "Writetabindex:". writetabindex($user,$year,$month,%months)."\n"; $tab_index.=writetabindex($user,$year,$month,%months); $tab_index.= "" if ($month ==6); } if (-d "$user/memories") { my $entries =0; my @file_list = glob("$user/memories/*"); my $file = ""; foreach $file (@file_list) { # print "Testing $file...\n"; if (! ($file =~ /index[0-9]*\.html/)) { $entries++; } } $tab_index.="

2001:

2002:
	memories $entries entries

\n"; } else { $tab_index.= "\n"; } sub writetabindex { my ($user,$year,$month,%months) = @_; my $entries =0; my @file_list = glob("$user/$year/$month/*"); my $file = ""; foreach $file (@file_list) { # print "Testing $file...\n"; if (! ($file =~ /index[0-9]*\.html/)) { $entries++; } } my $monthline =$monthnames[$month]; if (defined ($months{"$year/$month"})) { return " $monthline
$entries entries "; } else{ return "$monthline\n"; } } # Killing more junk HTML $result =~ s#(.*?)#$2#sg; $result =~ s#(.*?)#$2#sg; $result =~ s#(.*?)#$2#sg; $HEAD_tmp = $HEAD; $HEAD_tmp =~ s/ljdir.pl title/\u$user archives/; my $res_index = $HEAD_tmp. "

\u$user

". $tab_index. "

\u$user\'s Userinfo

". $result . $FOOT; $res_index =~ s/> $dir/index.html\n",2); open DF,">$dir/index.html" or die "error opening $dir/index.html for writing: $!\n"; print DF $res_index; close DF; } sub main_index { open (HDR, "); close (HDR) || warn ("couldn't close forindex.hxt: $!"); print "Index table for friends list..."; #Let's make lenta table my @monthnames = ('','Январь','Февраль','Март','Апрель','Май','Июнь', 'Июль','Август','Сентябрь','Октябрь','Ноябрь','Декабрь'); my $lenta_tab = ""; # This is not very pretty. Better do something with the date range. Oh well. my $year=2001; my $month=0; foreach $month (2..12) { $lenta_tab .= ""; $lenta_tab.= "" if ($month ==6); } $lenta_tab.= ""; $year=2002; foreach $month (1..10) { $lenta_tab .= ""; $lenta_tab.= "" if ($month ==6); } $lenta_tab .= "

2001:		". $monthnames[$month] . " " . files_num("/LJ/$LENTA/$year/$month") ." entries, " . files_volume("/LJ/$LENTA/$year/$month") . "

2002:	". $monthnames[$month] . " " . files_num("/LJ/$LENTA/$year/$month") ." entries, " . files_volume("/LJ/$LENTA/$year/$month") . "

"; print "Done!\n"; # Now the users tab my $users_tab = "

Участники

"; my @topdir = glob "*"; my @users = (); foreach (@topdir) { # print "Testing the directory $_...\n"; next if (! (-d $_)); next if ($_ eq $LENTA); next if ((! (-d "$_/2001")) && (! (-d "$_/2002"))); push (@users, $_); } my $i =0; foreach $i (0..(@users-1)) { my $user=$users[$i]; my $mem =0; if (-d "$user/memories") { $mem = files_num("$user/memories"); } my @monthly = glob ("$user/[0-9][0-9][0-9][0-9]/[0-9][0-9] $user/[0-9][0-9][0-9][0-9]/[0-9]"); my $num =0; foreach (@monthly) { $num += files_num("$_"); } # print "User $user: $mem memories, $num entries\n"; $users_tab .= ""; $users_tab .= "" if (($i +1)%6 ==0); } $users_tab .= "

$user
$num entries"; $users_tab .= "
$mem memories " if ($mem >0 ); $users_tab .= "

"; my $res_index = $HEADER. $lenta_tab. $users_tab. $FOOTER; logmsg(">> index.html\n",2); open DF,">index.html" or die "error opening index.html for writing: $!\n"; print DF $res_index; close DF; } # These two routines take a directory and return # the total volume of all index.html files and the # number of entries. Only index.html files # are accessed this way. sub files_num { my $dir = $_[0]; my @indices = glob("$dir/index*.html"); my $count =0; foreach (@indices) { next if (! (/^\Q$dir\E\/index\d+\.html$/)); open (IND, "$_") || die "Couldn't open $_: $!\n"; while () { if (//) { $count++; } } } # print "There are $count entries in the directory $dir\n"; return $count; } sub files_volume { my $dir = $_[0]; my @indices = glob("$dir/index*.html"); my $size =0; my $csize =""; foreach (@indices) { next if (! ($_=~/^\Q$dir\E\/index\d+\.html$/)); $size += (-s); } $size = $size/1000000; $csize = sprintf ("%.2f", $size); $csize .= " Mb"; # print "The total volume of entries in $dir is $csize\n"; return "$csize"; } =item index_ljsm($user) build index file for the given user =cut sub index_ljsm { my ($user) = @_; my ($year, $month); %memories = (); %posts = (); %stat= (); my @monthnames = ('','Январь','Февраль','Март','Апрель','Май','Июнь', 'Июль','Август','Сентябрь','Октябрь','Ноябрь','Декабрь'); # skip to next dir if there is no such user unless (-d LOCAL_DIR . $user) { logmsg(LOCAL_DIR . $user . " not found."); return; } my $user = $_[0]; open (HDR, "< $user/.header") || warn ("couldn't open $user/.header: $!"); $HEAD= join("", ); close (HDR) || warn ("couldn't close $user/.header: $!"); open (FTR, "<$user/.footer") || warn ("couldn't open $user/.footer: $!"); $FOOT= join("", ); close (FTR) || warn ("couldn't close $user/.footer: $!"); # traverse directory tree calling process_html for each file found find({ wanted => \&process_html_file, preprocess => \&sort_directory }, LOCAL_DIR . $user); # write index.html open DF, ">" . LOCAL_DIR . $user . "/index-ljsm.html" or die "error opening " . LOCAL_DIR . $user . "/index-ljsm.html" . "for writing: $!\n"; my $HEAD_tmp = $HEAD; $HEAD_tmp =~ s/ljdir.pl title/Index file for $user livejournal/; print DF $HEAD_tmp; print DF "

$user\'s livejournal. "; print DF "$stat{count_memos} memories " if (scalar keys %memories); if (scalar keys %posts) { print DF " | $stat{count_posts} posts: "; foreach (sort keys %posts) { # foreach year print DF "$_ "; } print DF "\n"; } print DF "
last updated: " . (scalar localtime) . "\n"; print DF '

' . "\n"; if (scalar keys %posts) { foreach $year (reverse sort keys %posts) { # $posts{$year} is a reference to the hash of months # year header print DF "\n"; print DF '

' . "\n"; print DF '' . $year . ': '; print DF "" . $monthnames[$_+0] . " | " foreach (sort {$a <=> $b} keys %{$posts{$year}}); print DF '

' . "\n"; # year body for $month (reverse sort {$a <=> $b} keys %{$posts{$year}}) { print DF "[ $monthnames[$month] ]
\n"; print DF $posts{$year}->{$month} . "

\n"; } } } if (scalar keys %memories) { print DF ''. "\n"; print DF '

' . "\n"; print DF 'Memories: ' . "\n"; print DF '

' . "\n"; print DF "

$_

\n" foreach (sort keys %memories); } print DF $FOOT; close DF or warn "Error closing file: $!\n"; } # sort filenames so that the most recent posts go first sub sort_directory { return sort {$b <=> $a} @_; } # callback subroutine for index_ljsm # sub process_html_file { my ($line, $link, $kw, $title, $amuser, $itemid, $date, $rnum); return unless ($File::Find::dir =~ m#(\d{4}/\d{1,2}|memories)#); return unless (-s && /\.html$/); return if (/index\d*\.html/); # $_ is set to file name and we are inside target directory open DF, "<$_" or die "Error opening $File::Find::name for reading: $!\n"; # search for link, keywords, title and date while ($line = ) { $kw = $1 if ($line =~ //); $title = $1 if ($line =~ /(.*?)<\/b><\/i>/); $title = "$1" if ($line =~ /Error<\/span>
(.*)$/); $date = $1 if ($line =~ m{href="@{[BASE_URL]}users/\w+/day/\d\d\d\d/\d\d/(\d{1,2})"}); if (($line =~/