#!/usr/bin/perl

# stfw: Straight Through the Firewall

# stick this on your web site and all of a sudden you've got raw
# buck-naked Google Thoughtcrime happening on your IP address.
# See if they can block us all.  PS: works for Google Cache too.
# So put that in your pipe and smoke it.

##  If you're seeing this Perl script instead of your               ##
##  search result, either (1) you didn't supply a "q" parameter     ##
##  in your URL (try the form below) or (2) too many people are     ##
##  using this copy of this script.  Slow down, try back later, or  ##
##  save this script and send it to someone who has a web server    ##
##  that can see Google.                                            ##

# Copyright 2006, 2010 Don Marti <dmarti@zgp.org>

# This is free software. You can redistribute it
# and/or modify it under the same terms as Perl itself.

# You will probably want to change the name of the script, and put
# a basic search form on a page.  Something like this:

# <form id="search" action="/cgi-bin/stfw">
#  <input type="text" name="q">
#  <input type="submit" value="Search">
# </form>

# Set the "$FAIL_TIMEOUT" and "$SUCCESS_TIMEOUT" below to reduce the
# rate at which this script will bug Google.

# for background: 
#  http://rconversation.blogs.com/rconversation/2010/01/google-puts-its-foot-down.html

use strict;
use CGI;
use File::Slurp qw(slurp);
use HTML::Parser ();
use LWP::UserAgent;
use Redis;
use URI;
use URI::QueryParam;

my $FAIL_TIMEOUT = 1800; # how long to wait after a failed request
my $SUCCESS_TIMEOUT = 6; # how long to wait after a successful request

# Filter out tags that are less likely to leave traces
# on the user's computer, but no guarantees.  Need iframe
# for cached pages (?)
my %safe_tag =  map { $_ => 1 } qw {
    head body title h1 h2 h3
    table td th tr
    a img div iframe
    input form textarea
    u small b large em span nobr cite wbr p noscript br
    style link
    ol ul li 
    span/
};

# Same with attributes -- for example we don't want to fire the 
# JavaScript stuff from the on.* attributes.
my %safe_attr =  map { $_ => 1 } qw {
    action class
    colspan rowspan
    href id maxlength method name nowrap rel size style
    title type valign value width
};

# Allowed query parameters
my %safe_param = map { $_ => 1 } qw { hl q start };

# Set a URL that will lead back here for clicking through more
# result pages
my $q = new CGI;
my $here = $q->url(-relative=>1);

# keep track of when we're between tags that we don't want to pass on
my %in_sensitive_element;

# Start building the text to be passed back to the browser
my $clean = '';

# Some rate limiting...
my $wait;
my $redis = Redis->new;
my $when = $redis->get($here);
if ($redis->get($here) > time() or !defined($q->param('q'))) {
    print "Content-type: text/plain\n\n";
    print slurp($0);
    exit(0);
}

# and let's not mess up the browser usage stats, so pass on 
# the real user-agent (browser fanboys leave this alone please)
my $browser = LWP::UserAgent->new;
$browser->agent($q->http('User-agent'));

# set up a URI object for the outgoing search
my $url = URI->new("http://www.google.com/search");

foreach my $qp ($q->param) {
    $url->query_param($qp, $q->param($qp)) if $safe_param{$qp};
}

my $search = HTTP::Request->new(GET => $url);

# do the HTTP request
my $result = $browser->request($search);

# Hey, Google, don't get mad at us, we'll go away for a while
# if rate limited.
if ($result->is_success()) {
    $redis->set($here, time() + $SUCCESS_TIMEOUT);
} else {
    $redis->set($here, time() + $FAIL_TIMEOUT);
}

# make a parser object, parse out the stuff that's safe* to pass
# on, send it back to the user
my $p = 
  HTML::Parser->new(api_version => 3,
                    start_h     => [\&start, "tagname, attr, text"],
                    end_h       => [\&end, "tagname, text"],
                    default_h   => [\&text, "text"],
);
$p->parse($result->content);
print $q->header(), $clean;
exit(0);

# this is the callback that gets called on an open tag.
# try not to send sensitive stuff and do a couple special things
# with images and links.
sub start {
    my ($name, $attr, $text) = @_;

    if (total_sensitive() > 0) {
        return;
    }

    if (!exists($safe_tag{$name})) {
        # $in_sensitive_element{$name}++;
        # return;
    } elsif ($name eq 'img') {
        if(defined($attr->{'alt'})) {
            $clean .= $attr->{'alt'};
        }
        return;
    }

    if (defined($attr->{'href'})) {
        $attr->{'href'} =~ s|^(http://[\d\.]+)?/search|$here|;
    } elsif (defined($attr->{'action'})) {
        $attr->{'action'} =~ s|^/search|$here|;
    }

    foreach my $a (keys(%$attr)) {
        if (!exists($safe_attr{$a})) {
            delete($attr->{$a});
        }
    }

    $clean .= "<$name " . 
              join (' ', map{qq{$_="$attr->{$_}"}} keys(%$attr)) .
              ">";
}

# close tag.  Keep track of whether we're in something sensitive.
sub end {
    my ($name, $text) = @_;
    if ($in_sensitive_element{$name} > 0) {
        $in_sensitive_element{$name}--;
    } else {
        $clean .= $text;
    }
}

# build onto the result if we're not in a sensitive place
sub text {
    if (total_sensitive() == 0) {
        $clean .= shift;
    } 
}

sub total_sensitive {
    my $sum = 0;
    while ( my ($key, $value) = each (%in_sensitive_element)) {
        $sum += $value;
    }
    return($sum);
}
