#!/usr/bin/perl

#    ______ _______  _____  _     _ _______ _______ _______
#   |_____/ |______ |   __| |     | |______ |______    |
#   |    \_ |______ |____\| |_____| |______ ______|    |
#
#                  _______  _____   ______
#                  |______ |     | |_____/
#                  |       |_____| |    \_
#
#     ______ _______ _______  _____  _    _ _______
#    |_____/ |______ |  |  | |     |  \  /  |_____| |
#    |    \_ |______ |  |  | |_____|   \/   |     | |_____

# This script asks Google Groups for Messages created by $author and
# prints a list of all found Message-IDs.
# v1.0 released by Dragan Espenschied
# This Software is in the Public Domain.
# v1.1 released by Julian Wiersbitzki
# Changes:
# - Google changed the HTML code of the message body; the $message_body matching was adapted.
# - A second file with the Google URLs of the messages is also created. These URLs can be
#   used in a removal request as well.

use strict;
use warnings;

use LWP::UserAgent;
use URI::Escape;

my $author = 'i.am@example.com';          # author's email address goes here
                                                # ($author is part of the search query and
                                                # of both output file names)

my %groups_messages;                            # all found messages, keyed by google's hash
                                                # identifier; each entry stores the group
                                                # name and later the Message-ID

                                                # Fake Browser. The agent string has to stay
                                                # on one line: a line break inside the
                                                # literal would end up in the User-Agent
                                                # header value.
my $ua = LWP::UserAgent->new(
   agent => 'Mozilla/5.0 (Linux; U; appSysName i686; de; rv:1.7.5) Gecko/20041108 Firefox/1.0',
);

my $result_page_counter = 0;                    # offset of the first hit on the current serp
                                                # (sent as the 'start' query parameter)

my $more = 1;                                   # 1 while further result pages should be
                                                # fetched, 0 once the end of the list
                                                # is reached

while($more == 1) {                             # keep paging until a stop condition clears $more

   print "SERP page: $result_page_counter\n";

                                                # search uri for the current offset, asking
                                                # for 100 unfiltered hits per page
   my $request_uri = join('',
      'http://groups.google.com/groups?q=author%3A', uri_escape($author),
      '&start=', $result_page_counter,
      '&hl=de&lr=&num=100&filter=0',
   );

   my $serp_response = $ua->get($request_uri);
   my $http_status   = $serp_response->code;

   if($http_status != 200) {                    # anything but "200 OK" aborts the script
      $more = 0;                                # (die below ends the run regardless)
      die "Error! $request_uri\nHTTP-Status: ".$http_status."\n";
   }

   my $serp_html = $serp_response->content;     # body of the result page

                                                # a "403 Forbidden" title in the body means
                                                # google refused the automated request
   if($serp_html =~ /<title>403 Forbidden<\/title>/m) {
      $more = 0;
      die "Error! Google thinks this is malicious software.\nPlease try again later.\n";
   }

                                                # walk over every thread link on the page:
                                                # $1 is the group name, $2 google's hash
                                                # identifier taken from the url fragment
   my $counter = 0;
   while($serp_html =~ /<a\s+href="\/group\/(\S+)\/browse_thread\/[^"]+#([0-9a-z]+)"/sg) {

      if(exists($groups_messages{$2})) {        # seen before: results start to repeat,
         $more = 0;                             # so no further page is requested
         next;
      }

      $groups_messages{$2}{group} = $1;         # new message: remember its group
      print "$2 -- $1\n";
      $counter++;
   }
   print "Found: $counter posts.\n";

   $more = 0 if $counter < 100;                 # a page with fewer than 100 new posts is
                                                # the last page for this query

   $result_page_counter = $result_page_counter + 100;   # offset of the next serp
   sleep(int(rand(5)));                         # random pause of 0-4 seconds between requests
}
                                                # open the two output files
my $export_file = open(SAVE, "> message_ids_for_$author.txt") or die "could not save file: $!\n";
my $export_file2 = open(SAVE2, "> message_urls_for_$author.txt") or die "could not save file: $!\n";


                                                # retrieve "source" of all found messages
foreach my $google_hash (keys %groups_messages) {

                                                # uri contains group name and google's hash
                                                # identifier
   my $request_uri = 'http://groups.google.com/group/'.$groups_messages{$google_hash}{group}.
   '/msg/'.$google_hash.'?dmode=source&hl=de';

   my $response = $ua->get($request_uri);

   unless($response->code == 200) {
      $more = 0;
      die "Error! $request_uri\nHTTP-Status: ".$response->code."\n";
   }

   my $message_body = $response->content;

                                                # Check on google spyware detection
   if($message_body =~ /<title>403 Forbidden<\/title>/m) {
      $more = 0;
      die "Error! Google thinks this is malicious software.\nPlease try again later.\n";
   }
                                                # find Message-ID from the header
   $message_body =~ /<pre>.+Message-ID: &lt;(\S+)&gt;.+<\/pre>/s;
   $groups_messages{$google_hash}{msgid} = $1;
   print "http://groups.google.de/group/$groups_messages{$google_hash}{group}/msg/$google_hash\n";
   # check if message-ID is extracted
   if($1 == "") {
     # if not display message
     print "Message not found, probably already deleted...\n";
   } else {
     # else print message-ID and message-URL to each files.
     print "$1\n";
     print SAVE "$1\n";
     print SAVE2 "http://groups.google.de/group/$groups_messages{$google_hash}{group}/msg/$google_hash\n";
   }
   sleep(int(rand(5)));                         # wait some time not to stress google too much
}

close SAVE;
close SAVE2;