#!/usr/bin/perl -Tw

###################################################################
#                                                                 #
# sitemap-o-matic: build a sitemaps.org (formerly Google) Sitemap # 
# Copyright 2006, 2007, 2009, 2010 Don Marti <dmarti@zgp.org>     #
#                                                                 #
# This is free software. You can redistribute it                  #
# and/or modify it under the same terms as Perl itself.           #
#                                                                 #
###################################################################

# version 0.5 -- 18 Sep 2010 -- Support <link rel="author">

# version 0.4 -- 30 Jun 2009 -- switch from URI::URL to just URI.

# version 0.3 -- 15 Sep 2007 -- updated schema from Google to
#                               sitemaps.org.

# version 0.2 -- 26 Apr 2006 -- no @ allowed in URLs

# version 0.1 -- 17 Mar 2006 -- initial release

require 5.008; # for proper UTF-8 handling in parser
use Date::Format;
use HTML::Parser;
use LWP::RobotUA;
use LWP::UserAgent;
use URI;
use XML::Writer;
use strict;

=pod

=head1 NAME sitemap-o-matic - build a sitemaps.org Sitemap

=head1 SYNOPSIS
  sitemap-o-matic [URL]

=head1 DESCRIPTION

 This script supports all tags of the 
 Sitemaps specification, version 0.9, as seen on:
 http://sitemaps.org/protocol.php

 It obtains the information directly from the page,
 so you can keep it in one place.

 Tag         Source of information
 ----------- -------------------------------------------
 loc:        requested or crawled URL
 lastmod:    obtained from the Last-Modified HTTP header
 changefreq: tag the page with <meta name="changefreq"
 priority:   tag the page with <meta name="priority"
 ----------- -------------------------------------------

 You can start this script with as many URLs on
 the command line as you like.  At least one of the
 URLs supplied on the command line must contain an
 email address in a mailto, link rel="author", or 
 link rev="made" tag.

 Since the Sitemap spec requires that all URLs in
 a sitemap be from the same host, URLs whose host
 and scheme do not match those of the first URL will
 be ignored.

=cut

# Set your email address here if you don't have it on the site
# my $mailto = 'joe@example.com';

if (!@ARGV) {
    print "Usage: $0 [URL] ...\n";
    exit(1);
}    

my @todo = map {URI->new($_)} @ARGV;
my $myhost = $todo[0]->host();
my $myscheme = $todo[0]->scheme();
my $myport = $todo[0]->port();
my %done;
my %linker;
my $url;
my $mymail;
my $count = 0;
my $robot;

my $ua = LWP::UserAgent->new;
my $parser = HTML::Parser->new(api_version => 3,
                               start_h     => [\&start, "tagname, attr"]);
$parser->utf8_mode(1);
my $writer = new XML::Writer( DATA_MODE => 1, DATA_INDENT => 2, 
                              ENCODING => 'utf-8'); 

while (@todo) {
    $url = shift(@todo);
    next if $done{$url};
    if ($count > 50_000) {
        print STDERR "More than 50,000 pages on site.\n";
        exit(1);
    }

    if ($count > $#ARGV and !$mymail) {
        print STDERR qq{No email address in mailto or <link rev="made">.\n};
        print STDERR "Add an address to enable crawling.\n";
        print STDERR "$count pages fetched.\n";
        exit(1);
    }

    my $req = HTTP::Request->new(GET => $url);
    my $result;
    if ($mymail) {
        if (!$robot) {
            $robot = LWP::RobotUA->new('sitemap-o-matic/0.3', $mymail);
            $robot->delay(0);
            undef($ua);
        }
        $result = $robot->request($req);
    }
    else {
        $result = $ua->request($req);
    }

    if ($result->is_error()) {
        print STDERR "error on $url\n";
        $done{$url}{'status'} = 'error '. $result->code;
    } elsif ($result->is_redirect()) {
         # print "redirected $url\n";
         $done{$url}{'status'} = 'redirected ';
    } elsif ($result->content_type eq 'text/html' or
           $result->content_type eq 'application/xhtml+xml') {
        $parser->parse($result->content);
        $parser->eof();
        if (defined($result->last_modified())) {
            ${done}{$url}{'lastmod'} = 
                time2str("%Y-%m-%d", $result->last_modified());
        }
        $done{$url}{'status'} = 'success';
    }
    else {
        $done{$url}{'status'} = "MIME type " . $result->content_type;
    }
    $count++;
}

$mymail = $ENV{'LOGNAME'} if !$mymail;

$writer->xmlDecl();
$writer->startTag( "urlset",
  "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
  "xsi:schemaLocation" => "http://www.sitemaps.org/schemas/sitemap/0.9 " .
               "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd",
  "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9"
);

$writer->comment("Sitemap generated " .  time2str("%Y-%m-%d", time()) .
                 " for $mymail");

foreach my $page (sort keys(%done)) {
    if ($done{$page}{'status'} eq 'success') {
        $writer->startTag('url');
        $writer->startTag('loc');
        $writer->characters($page);
        $writer->endTag('loc');
        foreach my $attr (qw(lastmod priority changefreq)) {
            if ($done{$page}{$attr}) {
                $writer->startTag($attr);
                $writer->characters($done{$page}{$attr});
                $writer->endTag($attr);
            }
        }
        $writer->endTag('url');
    }
    elsif ($done{$page}{'status'} =~ /^error/) {
        print STDERR "$done{$page}{'status'} $page ";
        print STDERR " linked from ", join(' ', keys(%{$linker{$page}})), "\n";
    }
}

$writer->endTag('urlset');
$writer->end();

sub start {
    my($tagname, $attr) = @_;
    if (($tagname eq 'a' or $tagname eq 'link')
        and defined($attr->{'href'})) {
        my $target = URI->new_abs($attr->{'href'}, $url)->canonical();
        # print $target, "\n";
        $target->fragment(undef);
        $target->query(undef);
        if (($target->scheme() eq $myscheme 
            and $target->port eq $myport
            and $target->host eq $myhost) and
            $target->as_string() !~ /\@/) {
            push(@todo, $target);
            $linker{$target}{$url} = 1;
        }
        if (defined($attr->{'rev'}) and $attr->{'rev'} eq 'made') {
            $mymail = $target->to;
        } elsif (defined($attr->{'rel'}) and $attr->{'rel'} eq 'author') {
            $mymail = $target->to;
        } elsif ($target->scheme() eq 'mailto' and !$mymail) {
            $mymail = $target->to;
        }
    }
    elsif ($tagname eq 'meta') {
        foreach my $metaname (qw(changefreq priority)) {
            if (defined($attr->{'name'}) and $attr->{'name'} eq $metaname) {
                $done{$url}{$metaname} = $attr->{'content'};
            }
        }
    }
}
