# lj-backer-upper
# http://marnanel.org/writing/how-to-back-up-your-livejournal
#
# This is an early draft of a really rather hacky script,
# shared in case it is useful.
# It is provided under the GNU GPL with no warranty.
#
# Copyright (c) 2009 Thomas Thurman (thomas at thurman org uk)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.
# THINGS TO FIX:
# 1) Linkification of URLs
# 2) Switches
# 3) Special characters in the password
# 4) Recover more graciously from errors.
#
# Patches always welcome.
use strict;
use warnings;
use LWP::UserAgent;
use URI::Escape;
use Data::Dumper;
use Time::Local;
use XML::Parser;
use HTTP::Cookies;
my $lj_username = $ENV{LJUSER};
my $lj_password = $ENV{LJPASS};
die "Please give an LJ username and password in the LJUSER and LJPASS environment variables." unless $lj_username && $lj_password;
# FIXME: If they have really silly characters in their password this will break
# Where to put all the output files
my $path = '.';
# How long to sleep between fetches (in seconds)
my $sleeping = 1;
# The site to read from
my $site = 'livejournal.com';
####################################################
my %users;
my $charflavour = undef;
my %chardata;
my $startid;
my $dumpcomments = 0;
my %charelements = map{$_=>1} qw(maxid nextid body date subject);
my %states = ( S=>'screened', D=>'deleted', F=>'frozen', A=>'active' );
####################################################
our $ua = LWP::UserAgent->new();
set_up_useragent();
get_posts();
get_comments();
print "All done.\n";
sub set_up_useragent {
# it's considerate to set a user-agent
$ua->agent("lj-backer-upper - marnanel\@livejournal.com");
# Get a cookie from LJ and use it to authenticate ourselves
my %cookiedata = get_result("mode=sessiongenerate&user=$lj_username&password=$lj_password&ipfixed=true");
my $cookie_jar = HTTP::Cookies->new();
$cookie_jar->set_cookie(undef, 'ljsession', $cookiedata{ljsession}, '/', $site);
$ua->cookie_jar ($cookie_jar);
}
sub get_posts {
my %daycounts = get_day_counts();
my $separate_days = scalar(keys(%daycounts));
my $total_posts = 0;
for (values %daycounts) { $total_posts += $_; }
print "You made $total_posts posts on $separate_days separate days.\n";
my $count_so_far = 0;
for my $date (sort keys %daycounts) {
my ($y, $m, $d) = $date =~ /(....)-(..)-(..)/;
$count_so_far++;
printf "Reading $date (%03d%%)...", ($count_so_far/$separate_days)*100;
get_day(int($y), int($m), int($d));
print " done.\n";
sleep($sleeping);
}
}
sub get_day_counts {
my %result = get_result("mode=getdaycounts&user=$lj_username&password=$lj_password");
return %result;
}
sub get_day {
my ($y, $m, $d) = @_;
my %result = get_result("mode=getevents&ver=1&selecttype=day&year=$y&month=$m&day=$d&user=$lj_username&password=$lj_password");
my %by_item;
for (my $i=1; $i<=int($result{events_count}); $i++) {
my $event = {
itemid => $result{"events_${i}_itemid"},
timestamp => $result{"events_${i}_eventtime"},
body => $result{"events_${i}_event"},
subject => $result{"events_${i}_subject"},
};
$event->{body} =~ s/\+/ /g;
$event->{body} =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
if (defined $result{"events_${i}_security"}) {
if ($result{"events_${i}_security"} eq 'private') {
$event->{lock} = 'private';
} elsif ($result{"events_${i}_security"} eq 'usemask') {
die "LJ sent us a filtered post without a filter, duh." unless defined $result{"events_${i}_allowmask"};
$event->{lock} = 'filter '.$result{"events_${i}_allowmask"};
} else {
die "Totally weird value of security: ".$result{"events_${i}_security"};
}
} else {
$event->{lock} = 'public';
}
$by_item{$event->{itemid}} = $event;
}
for (my $i=1; $i<=int($result{prop_count}); $i++) {
$by_item{$result{"prop_${i}_itemid"}}->{$result{"prop_${i}_name"}} = $result{"prop_${i}_value"};
}
# Some fields we handle specially, so we need not to print them
# as headers.
my %headers_not_to_print = map{$_=>1} qw/body subject taglist opt_preformatted current_location/;
for my $item (values %by_item) {
$item->{subject} = '(no subject)' unless $item->{subject};
my $filename = $path .'/'. subject_maker($item->{subject});
$filename .= '-'.$item->{itemid} if (-e "$filename.txt");
$filename .= '.txt';
unless ($item->{opt_preformatted}) {
$item->{body} =~ s!\s*\n!
\n!g;
# we ought to linkify URLs which aren't in tags.
# patches welcome.
}
# LJ-specific tags:
# gets replaced with ^L (there's a "see more" plugin
# which uses this); there's no really good way to handle the
# "text" parameter, so we write it into the text.
$item->{body} =~ s/\/\[$1\]<\/b>\014/g;
$item->{body} =~ s/\]*>/\014/g;
$item->{body} =~ s/\<\/lj-cut[^>]*>//g;
# lj user
$item->{body} =~ s!!\u$1!g;
open BLOSXOM, ">$filename" or die "Can't open $filename: $!";
binmode BLOSXOM, ":utf8";
print BLOSXOM $item->{subject}."\n";
# Tags has to go first (yes, it ought to use blosxom's
# metadata, but it goes its own way)
print BLOSXOM "Tags: ".$item->{taglist}."\n" if defined $item->{taglist};
for my $lj_header (keys %$item) {
next if $headers_not_to_print{$lj_header};
my $value = $item->{$lj_header};
$lj_header =~ s/_/-/g; # hyphens are nicer than underscores
print BLOSXOM "meta-lj-$lj_header: $value\n";
}
# current_location is handled specially because it's renamed
# (it's read by geotagger)
print BLOSXOM "meta-location: $item->{current_location}\n" if defined $item->{current_location};
# and now the body!
print BLOSXOM "\n$item->{body}\n";
close BLOSXOM or die "Can't close $filename: $!";
# and touch it so that the date's right
my ($year, $month, $day, $hour, $min, $sec) = $item->{timestamp} =~ /^(....)-(..)-(..) (..):(..):(..)$/;
my $newdate = timelocal($sec, $min, $hour, $day, $month-1, $year-1900);
utime($newdate, $newdate, $filename);
}
}
sub get_result {
my ($command) = @_;
my $req = HTTP::Request->new(POST => "http://www.$site/interface/flat");
$req->content_type('application/x-www-form-encoded');
$req->content($command);
my $failcount = 0;
my $max_failcount = 5;
my $res;
while ($failcount < $max_failcount) {
eval { $res = $ua->request($req); };
last unless $@ && $res->is_success();
$failcount++;
print " $@ ..." if $@;
print ' '.$res->status_line().' ...' unless $res->is_success();
sleep $failcount;
}
die "something's obviously broken! failed too often" if $failcount == $max_failcount;
die "didn't get any content-- is the network connected?" unless $res->content();
my %result = split("\n", $res->content()) or die "Only a single line of content: ".$res->content();
if ($result{success} eq 'FAIL') {
die "LJ said 'fail': $result{errmsg}";
} elsif ($result{success} ne 'OK') {
die "LJ didn't say 'ok' or 'fail'. How weird is that?";
}
delete $result{success};
delete $result{errmsg};
return %result;
}
# this is a bit copy-and-pasted and could be refactored a bit
sub get_comment_data {
my ($command) = @_;
my $req = HTTP::Request->new(GET => "http://www.$site/export_comments.bml?$command");
my $failcount = 0;
my $max_failcount = 5;
my $res;
while ($failcount < $max_failcount) {
eval { $res = $ua->request($req); };
last unless $@ && $res->is_success();
$failcount++;
print " $@ ..." if $@;
print ' '.$res->status_line().' ...' unless $res->is_success();
sleep $failcount;
}
die "something's obviously broken! failed too often" if $failcount == $max_failcount;
die "didn't get any content-- is the network connected?" unless $res->content();
return $res->content();
}
sub subject_maker {
my ($subject) = @_;
$subject = lc $subject;
$subject =~ s/^\s+//;
$subject =~ s/\s+$//;
$subject =~ s/ /\-/g;
$subject =~ s/[^a-z0-9-]//g;
$subject =~ s/^-*//;
$subject = 'subject' if $subject eq '';
return $subject;
}
##############################
# Comments stuff
sub handle_start {
my ($expat, $element, %attrs) = @_;
if ($element eq 'usermap') {
$users{$attrs{'id'}} = $attrs{'user'};
} elsif ($element eq 'comment') {
%chardata = (
%chardata,
# some things which need to default
subject => undef,
parentid => undef,
state => 'A',
# include everything wholesale
%attrs);
} elsif ($charelements{$element}) {
# expecting character data
$charflavour = $element;
$chardata{$charflavour} = '';
}
}
sub handle_end {
my ($expat, $element) = @_;
$charflavour = undef if $charflavour;
if ($dumpcomments && $element eq 'comment') {
open COMMENT, ">comment-$chardata{jitemid}-$chardata{id}.dat" or die "can't open: $!";
binmode COMMENT, ":utf8";
print COMMENT "Comment-ID: $chardata{id}\n";
print COMMENT "Parent-Comment-ID: $chardata{parentid}\n" if $chardata{parentid};
print COMMENT "Journal-Entry-ID: $chardata{jitemid}\n";
print COMMENT "From: $users{$chardata{posterid}}\n";
print COMMENT "Date: $chardata{date}\n";
print COMMENT "State: $states{$chardata{state}}\n";
print COMMENT "\n";
print COMMENT "$chardata{body}\n" if $chardata{body};
close COMMENT or die "can't close: $!";
}
}
sub handle_char {
my ($expat, $string) = @_;
$chardata{$charflavour} .= $string if $charflavour;
}
sub get_comments {
my $parser = new XML::Parser (Handlers => { Start => \&handle_start,
End => \&handle_end,
Char => \&handle_char,
});
$startid = 0;
%users = ();
while (1) {
$dumpcomments = 0;
$parser->parse(get_comment_data ("get=comment_meta&startid=$startid"));
print "Downloading comment $startid of $chardata{maxid}...\n";
$dumpcomments = 1;
sleep($sleeping);
$parser->parse(get_comment_data ("get=comment_body&startid=$startid"));
if ($chardata{nextid}) {
$startid = 1*$chardata{nextid};
$chardata{nextid} = undef;
} else {
last;
}
}
}