# lj-backer-upper # http://marnanel.org/writing/how-to-back-up-your-livejournal # # This is an early draft of a really rather hacky script, # shared in case it is useful. # It is provided under the GNU GPL with no warranty. # # Copyright (c) 2009 Thomas Thurman (thomas at thurman org uk) # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA. # THINGS TO FIX: # 1) Linkification of URLs # 2) Switches # 3) Special characters in the password # 4) Recover more graciously from errors. # # Patches always welcome. use strict; use warnings; use LWP::UserAgent; use URI::Escape; use Data::Dumper; use Time::Local; use XML::Parser; use HTTP::Cookies; my $lj_username = $ENV{LJUSER}; my $lj_password = $ENV{LJPASS}; die "Please give an LJ username and password in the LJUSER and LJPASS environment variables." unless $lj_username && $lj_password; # FIXME: If they have really silly characters in their password this will break # Where to put all the output files my $path = '.'; # How long to sleep between fetches (in seconds) my $sleeping = 1; # The site to read from my $site = 'livejournal.com'; #################################################### my %users; my $charflavour = undef; my %chardata; my $startid; my $dumpcomments = 0; my %charelements = map{$_=>1} qw(maxid nextid body date subject); my %states = ( S=>'screened', D=>'deleted', F=>'frozen', A=>'active' ); #################################################### our $ua = LWP::UserAgent->new(); set_up_useragent(); get_posts(); get_comments(); print "All done.\n"; sub set_up_useragent { # it's considerate to set a user-agent $ua->agent("lj-backer-upper - marnanel\@livejournal.com"); # Get a cookie from LJ and use it to authenticate ourselves my %cookiedata = get_result("mode=sessiongenerate&user=$lj_username&password=$lj_password&ipfixed=true"); my $cookie_jar = HTTP::Cookies->new(); $cookie_jar->set_cookie(undef, 'ljsession', $cookiedata{ljsession}, '/', $site); $ua->cookie_jar ($cookie_jar); } sub get_posts { my %daycounts = get_day_counts(); my $separate_days = scalar(keys(%daycounts)); my $total_posts = 0; for (values %daycounts) { $total_posts += $_; } print "You made $total_posts posts on $separate_days separate days.\n"; my $count_so_far = 0; for my $date (sort keys %daycounts) { my ($y, $m, $d) = $date =~ /(....)-(..)-(..)/; $count_so_far++; printf "Reading $date (%03d%%)...", ($count_so_far/$separate_days)*100; get_day(int($y), int($m), int($d)); print " done.\n"; sleep($sleeping); } } sub get_day_counts { my %result = get_result("mode=getdaycounts&user=$lj_username&password=$lj_password"); return %result; } sub get_day { my ($y, $m, $d) = @_; my %result = get_result("mode=getevents&ver=1&selecttype=day&year=$y&month=$m&day=$d&user=$lj_username&password=$lj_password"); my %by_item; for (my $i=1; $i<=int($result{events_count}); $i++) { my $event = { itemid => $result{"events_${i}_itemid"}, timestamp => $result{"events_${i}_eventtime"}, body => $result{"events_${i}_event"}, subject => $result{"events_${i}_subject"}, }; $event->{body} =~ s/\+/ /g; $event->{body} =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; if (defined $result{"events_${i}_security"}) { if ($result{"events_${i}_security"} eq 'private') { $event->{lock} = 'private'; } elsif ($result{"events_${i}_security"} eq 'usemask') { die "LJ sent us a filtered post without a filter, duh." unless defined $result{"events_${i}_allowmask"}; $event->{lock} = 'filter '.$result{"events_${i}_allowmask"}; } else { die "Totally weird value of security: ".$result{"events_${i}_security"}; } } else { $event->{lock} = 'public'; } $by_item{$event->{itemid}} = $event; } for (my $i=1; $i<=int($result{prop_count}); $i++) { $by_item{$result{"prop_${i}_itemid"}}->{$result{"prop_${i}_name"}} = $result{"prop_${i}_value"}; } # Some fields we handle specially, so we need not to print them # as headers. my %headers_not_to_print = map{$_=>1} qw/body subject taglist opt_preformatted current_location/; for my $item (values %by_item) { $item->{subject} = '(no subject)' unless $item->{subject}; my $filename = $path .'/'. subject_maker($item->{subject}); $filename .= '-'.$item->{itemid} if (-e "$filename.txt"); $filename .= '.txt'; unless ($item->{opt_preformatted}) { $item->{body} =~ s!\s*\n!
\n!g; # we ought to linkify URLs which aren't in tags. # patches welcome. } # LJ-specific tags: # gets replaced with ^L (there's a "see more" plugin # which uses this); there's no really good way to handle the # "text" parameter, so we write it into the text. $item->{body} =~ s/\/\[$1\]<\/b>\014/g; $item->{body} =~ s/\]*>/\014/g; $item->{body} =~ s/\<\/lj-cut[^>]*>//g; # lj user $item->{body} =~ s!!\u$1!g; open BLOSXOM, ">$filename" or die "Can't open $filename: $!"; binmode BLOSXOM, ":utf8"; print BLOSXOM $item->{subject}."\n"; # Tags has to go first (yes, it ought to use blosxom's # metadata, but it goes its own way) print BLOSXOM "Tags: ".$item->{taglist}."\n" if defined $item->{taglist}; for my $lj_header (keys %$item) { next if $headers_not_to_print{$lj_header}; my $value = $item->{$lj_header}; $lj_header =~ s/_/-/g; # hyphens are nicer than underscores print BLOSXOM "meta-lj-$lj_header: $value\n"; } # current_location is handled specially because it's renamed # (it's read by geotagger) print BLOSXOM "meta-location: $item->{current_location}\n" if defined $item->{current_location}; # and now the body! print BLOSXOM "\n$item->{body}\n"; close BLOSXOM or die "Can't close $filename: $!"; # and touch it so that the date's right my ($year, $month, $day, $hour, $min, $sec) = $item->{timestamp} =~ /^(....)-(..)-(..) (..):(..):(..)$/; my $newdate = timelocal($sec, $min, $hour, $day, $month-1, $year-1900); utime($newdate, $newdate, $filename); } } sub get_result { my ($command) = @_; my $req = HTTP::Request->new(POST => "http://www.$site/interface/flat"); $req->content_type('application/x-www-form-encoded'); $req->content($command); my $failcount = 0; my $max_failcount = 5; my $res; while ($failcount < $max_failcount) { eval { $res = $ua->request($req); }; last unless $@ && $res->is_success(); $failcount++; print " $@ ..." if $@; print ' '.$res->status_line().' ...' unless $res->is_success(); sleep $failcount; } die "something's obviously broken! failed too often" if $failcount == $max_failcount; die "didn't get any content-- is the network connected?" unless $res->content(); my %result = split("\n", $res->content()) or die "Only a single line of content: ".$res->content(); if ($result{success} eq 'FAIL') { die "LJ said 'fail': $result{errmsg}"; } elsif ($result{success} ne 'OK') { die "LJ didn't say 'ok' or 'fail'. How weird is that?"; } delete $result{success}; delete $result{errmsg}; return %result; } # this is a bit copy-and-pasted and could be refactored a bit sub get_comment_data { my ($command) = @_; my $req = HTTP::Request->new(GET => "http://www.$site/export_comments.bml?$command"); my $failcount = 0; my $max_failcount = 5; my $res; while ($failcount < $max_failcount) { eval { $res = $ua->request($req); }; last unless $@ && $res->is_success(); $failcount++; print " $@ ..." if $@; print ' '.$res->status_line().' ...' unless $res->is_success(); sleep $failcount; } die "something's obviously broken! failed too often" if $failcount == $max_failcount; die "didn't get any content-- is the network connected?" unless $res->content(); return $res->content(); } sub subject_maker { my ($subject) = @_; $subject = lc $subject; $subject =~ s/^\s+//; $subject =~ s/\s+$//; $subject =~ s/ /\-/g; $subject =~ s/[^a-z0-9-]//g; $subject =~ s/^-*//; $subject = 'subject' if $subject eq ''; return $subject; } ############################## # Comments stuff sub handle_start { my ($expat, $element, %attrs) = @_; if ($element eq 'usermap') { $users{$attrs{'id'}} = $attrs{'user'}; } elsif ($element eq 'comment') { %chardata = ( %chardata, # some things which need to default subject => undef, parentid => undef, state => 'A', # include everything wholesale %attrs); } elsif ($charelements{$element}) { # expecting character data $charflavour = $element; $chardata{$charflavour} = ''; } } sub handle_end { my ($expat, $element) = @_; $charflavour = undef if $charflavour; if ($dumpcomments && $element eq 'comment') { open COMMENT, ">comment-$chardata{jitemid}-$chardata{id}.dat" or die "can't open: $!"; binmode COMMENT, ":utf8"; print COMMENT "Comment-ID: $chardata{id}\n"; print COMMENT "Parent-Comment-ID: $chardata{parentid}\n" if $chardata{parentid}; print COMMENT "Journal-Entry-ID: $chardata{jitemid}\n"; print COMMENT "From: $users{$chardata{posterid}}\n"; print COMMENT "Date: $chardata{date}\n"; print COMMENT "State: $states{$chardata{state}}\n"; print COMMENT "\n"; print COMMENT "$chardata{body}\n" if $chardata{body}; close COMMENT or die "can't close: $!"; } } sub handle_char { my ($expat, $string) = @_; $chardata{$charflavour} .= $string if $charflavour; } sub get_comments { my $parser = new XML::Parser (Handlers => { Start => \&handle_start, End => \&handle_end, Char => \&handle_char, }); $startid = 0; %users = (); while (1) { $dumpcomments = 0; $parser->parse(get_comment_data ("get=comment_meta&startid=$startid")); print "Downloading comment $startid of $chardata{maxid}...\n"; $dumpcomments = 1; sleep($sleeping); $parser->parse(get_comment_data ("get=comment_body&startid=$startid")); if ($chardata{nextid}) { $startid = 1*$chardata{nextid}; $chardata{nextid} = undef; } else { last; } } }