#!/usr/bin/perl -sw- # ---------------------------------------------------------- # http2mbox # aka "sucker" # # Meng Weng Wong # $Id$ # suck down a given index dir and write out an mbox file. # # usage: # # arguments: # # flags: # # input: # # output: # # license: GPL # # part of the rss/email / stubemail project. # ---------------------------------------------------------- # ---------------------------------------------------------- # business rules # ---------------------------------------------------------- my $doc = <<"EODOC"; This program operates on behalf of the end-user Receiver. The end-user Receiver has one or more Whitelisted Senders. The end-user Receiver has one or more Stublist Notification Servers. When a Sender mails a Receiver; - the Sender Server sends a Stub to the Receiver's Stublist Notification Server. - the Sender Server writes the full message to an Outbound Mailbox. The Stublist Notification Servers and Whitelisted Senders are both recorded inside the End-User Addressbook, ~/.addressbook-stub. When the Receiver checks mail: Stublist Notifications: - for each Stublist Notification Server the Receiver uses, the Receiver asks the Stublist Notification Server for a list of all the stubs received since the last time the Receiver polled the Stublist Notification Server. Preparation: - some of the Stubs are from whitelisted senders. Some are not. - all Stubs from whitelisted senders cause those senders to be added to the Poll List. - all Stubs from non-whitelisted senders cause those senders to be added to the Unknown Sender List. - any senders in the Sender Whitelist who: - were not already represented in the Stublist - have not been polled in $POLL_INTERVAL (typically 1 day) are then added to the Poll List. Polling: - for each sender in the Poll List, the Receiver connects to the Sender Server and asks for an index of new messages. The Receiver passes a "since" parameter indicating which messages should be considered "new". The "since" parameter is coded in Unix UTC epoc seconds. The "since" parameter represents the date of the most recent message received by the Receiver from that Sender. - after receiving the index of new messages, the Receiver asks for the content of each new message. - the Receiver updates its "since" value for that Sender to be the timestamp of the youngest message transferred. - the Receiver updates its "last polled" value for that Sender to be the current time. Unknown Senders: - for each sender in the Unknown Sender list, verify the assertions presented in the Stub. EODOC # ---------------------------------------------------------- # initialization # ---------------------------------------------------------- my $QUERY_INTERVAL = 86400; # only query feeds once a day, UNLESS the stublist contains a given sender/url. # ---------------------------------------------------------- # no user-serviceable parts below this line # ---------------------------------------------------------- use strict; use LWP; use LWP::UserAgent; use HTML::Parser; use URI; use Data::Dumper; use IO::File; my @stublist_notification_urls = @ARGV; my $ua = LWP::UserAgent->new; $ua->agent("http2mbox/0.1 "); my $fh = IO::File->new; my $addressbook_file = "$ENV{HOME}/.addressbook-stub"; my $addressbook_fh = $fh->open("< $addressbook_file") or die "unable to open $addressbook_file: $!\n"; my @addressbook = parse_addressbook($addressbook_fh); my %addressbook_by_url; my %addressbook_by_sender; foreach (@addressbook) { push @{$addressbook_by_url{$_->{url}}}, $_ if $_->{url}; push @{$addressbook_by_sender{$_->{sender}}}, $_ if $_->{sender}; } my $new_addressbook_file = "$addressbook_file.new.$$"; my @new_addressbook; open (NEW_ADDRESSBOOK, "> $new_addressbook_file") or die "unable to open new $addressbook_file.new for writing: $!\n"; $/=""; my %stubs_by_url; my @stubs_not_in_addressbook; my @stublist = get_stublist_notifications(); # populates %stubs_by_url with feeds that have sent us stub notifications my @addresses_to_check = want_to_check(addressbook => \@addressbook, stublist => \@stublist); foreach my $stanza (@addresses_to_check) { if (not $stanza->{url}) { push @new_addressbook, $stanza; next; } # URL of the sender's feed index my $url = $stanza->{url}; # if we only want to query, say, once a day, we add 86400 to this value my $last_time_i_queried_this_url = $stanza->{last_time_i_queried_this_url}; # we ask the server "have you got anything for me more recent than this?" my $last_activity_seen_at_url = $stanza->{last_activity_seen_at_url}; print STDERR "url=$url\n"; my $want_to_check = 0; if (time - $last_time_i_queried_this_url > $QUERY_INTERVAL) { $want_to_check++; print STDERR "*** we last checked $url more than $QUERY_INTERVAL ago.\n"; } elsif (stub_received_for($url)) { $want_to_check++; } if (not $want_to_check) { push @new_addressbook, \%stanza; next; } # request only entries that are > $last_activity_seen_at_url. my $req = HTTP::Request->new(GET => "$url?since=$last_activity_seen_at_url"); # Pass request to the user agent and get a response back my $res = $ua->request($req); # Check the outcome of the response if ($res->is_success) { my @entries = traverse_index(last_activity_seen_at_url => $last_activity_seen_at_url, url => $url, index_content => $res->content, ); if (@entries) { print_entries(entries => \@entries); my $new_last_activity_seen_at_url = $entries[-1]->{mtime}; push @new_addressbook, { %stanza, last_time_i_queried_this_url=>time, last_activity_seen_at_url=>$new_last_activity_seen_at_url }; next; } else { push @new_addressbook, { %stanza, last_time_i_queried_this_url=>time, last_activity_seen_at_url=>$last_activity_seen_at_url }; next; } } else { print STDERR "unable to get $url\n"; } } foreach my $stanza (@new_addressbook) { print NEW_ADDRESSBOOK map { "$_ = $stanza->{$_}\n" } sort keys %$stanza; print NEW_ADDRESSBOOK "\n"; } close NEW_ADDRESSBOOK; rename($addressbook_file, "$addressbook_file." . time); rename($new_addressbook_file, $addressbook_file); # ---------------------------------------------------------- # chunks of text # ---------------------------------------------------------- # ---------------------------------------------------------- # main # ---------------------------------------------------------- # ---------------------------------------------------------- # functions # ---------------------------------------------------------- sub get_stublist_notifications { # http://blizzard.pobox.com/~mengwong/stubmail/stublist.cgi/mengwong@rssemail.mailzone.com foreach my $stublist_notification_url (@stublist_notification_urls) { print STDERR "*** getting stublist notifications from $stublist_notification_url\n"; # request only entries that are > $last_activity_seen_at_url. my $req = HTTP::Request->new(GET => $stublist_notification_url); # Pass request to the user agent and get a response back my $res = $ua->request($req); # Check the outcome of the response if ($res->is_success) { print STDERR "*** success.\n"; for my $stanza (split /\n\n/, $res->content) { my @stanza_lines = split /\n/, $stanza; next if not @stanza_lines; my %stanza = map { split /\s*=\s*/, $_, 2 } grep { not /^\s*\#/ } @stanza_lines; if (not $stanza->{url}) { next; } # URL of the sender's feed index my $url = $stanza->{url}; $stubs_by_url{$stanza->{url}}++; } } else { print STDERR "unable to get $stublist_notification_url\n"; } } } sub stub_received_for { my ($url) = @_; return $stubs_by_url{$url}; } sub traverse_index { my %args = @_; my $last_activity_seen_at_url = $args{last_activity_seen_at_url}; my $index_content = $args{index_content}; my @entries; print STDERR $args{index_content}; for (split /\n/, $index_content) { next unless s/^entry: //; print STDERR "$_\n"; my @entry = split ' '; my %entry = map { split /=/, $_, 2 } @entry; print STDERR "*** mtime=$entry{mtime}\n"; print STDERR "*** filename=$entry{filename}\n"; $entry{url} = "$args{url}/$entry{filename}"; print STDERR "constructed url = $entry{url}\n"; push @entries, \%entry; } return @entries; } sub print_entries { my %args = @_; my @entries = @{$args{entries}||[]}; foreach my $entry (@entries) { # request only entries that are > $last_activity_seen_at_url. my $req = HTTP::Request->new(GET => $entry->{url}); # Pass request to the user agent and get a response back my $res = $ua->request($req); # Check the outcome of the response if ($res->is_success) { print $res->content; } else { print STDERR "unable to get $entry->{url}\n"; } } } sub parse_addressbook { my $fh = shift; my @toreturn; while (<$fh>) { my @toreturn; my @stanza_lines = split /\n/, $_; next if not @stanza_lines; my %stanza = map { split /\s*=\s*/, $_, 2 } grep { not /^\s*\#/ } @stanza_lines; push @toreturn, \%stanza; } return @toreturn; } # ---------------------------------------------------------- # format statements # ----------------------------------------------------------