#!/usr/bin/perl -Tw ################################################################### # # # sitemap-o-matic: build a sitemaps.org (formerly Google) Sitemap # # Copyright 2006, 2007 Don Marti # # # # This is free software. You can redistribute it # # and/or modify it under the same terms as Perl itself. # # # ################################################################### # version 0.3 -- 15 Sep 2007 -- updated schema from Google to # sitemaps.org. # version 0.2 -- 26 Apr 2006 -- no @ allowed in URLs # version 0.1 -- 17 Mar 2006 -- initial release require 5.008; # for proper UTF-8 handling in parser use Date::Format; use HTML::Parser; use LWP::RobotUA; use LWP::UserAgent; use URI::URL; use XML::Writer; use strict; =pod =head1 NAME sitemap-o-matic - build a sitemaps.org Sitemap =head1 SYNOPSIS sitemap-o-matic [URL] =head1 DESCRIPTION This script supports all tags of the Sitemaps specification, version 0.9, as seen on: http://sitemaps.org/protocol.php It obtains the information directly from the page, so you can keep it in one place. Tag Source of information ----------- ------------------------------------------- loc: requested or crawled URL lastmod: obtained from the Last-Modified HTTP header changefreq: tag the page with host(); my $myscheme = $todo[0]->scheme(); my $myport = $todo[0]->port(); my %done; my %linker; my $url; my $mymail; my $count = 0; my $robot; my $ua = LWP::UserAgent->new; my $parser = HTML::Parser->new(api_version => 3, start_h => [\&start, "tagname, attr"]); $parser->utf8_mode(1); my $writer = new XML::Writer( DATA_MODE => 1, DATA_INDENT => 2, ENCODING => 'utf-8'); while (@todo) { $url = shift(@todo); next if $done{$url}; if ($count > 50_000) { print STDERR "More than 50,000 pages on site.\n"; exit(1); } if ($count > $#ARGV and !$mymail) { print STDERR qq{No email address in mailto or .\n}; print STDERR "Add an address to enable crawling.\n"; print STDERR "$count pages fetched.\n"; exit(1); } my $req = HTTP::Request->new(GET => $url); my $result; if ($mymail) { if (!$robot) { $robot = LWP::RobotUA->new('sitemap-o-matic/0.3', $mymail); $robot->delay(0); undef($ua); } $result = $robot->request($req); } else { $result = $ua->request($req); } if ($result->is_error()) { $done{$url}{'status'} = 'error '. $result->code; } elsif ($result->content_type eq 'text/html' or $result->content_type eq 'application/xhtml+xml') { $parser->parse($result->content); $parser->eof(); if (defined($result->last_modified())) { ${done}{$url}{'lastmod'} = time2str("%Y-%m-%d", $result->last_modified()); } $done{$url}{'status'} = 'success'; } else { $done{$url}{'status'} = "MIME type " . $result->content_type; } $count++; } $mymail = $ENV{'LOGNAME'} if !$mymail; $writer->xmlDecl(); $writer->startTag( "urlset", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance", "xsi:schemaLocation" => "http://www.sitemaps.org/schemas/sitemap/0.9 " . "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd", "xmlns" => "http://www.sitemaps.org/schemas/sitemap/0.9" ); $writer->comment("Sitemap generated " . time2str("%Y-%m-%d", time()) . " for $mymail"); foreach my $page (sort keys(%done)) { if ($done{$page}{'status'} eq 'success') { $writer->startTag('url'); $writer->startTag('loc'); $writer->characters($page); $writer->endTag('loc'); foreach my $attr (qw(lastmod priority changefreq)) { if ($done{$page}{$attr}) { $writer->startTag($attr); $writer->characters($done{$page}{$attr}); $writer->endTag($attr); } } $writer->endTag('url'); } elsif ($done{$page}{'status'} =~ /^error/) { print STDERR "$done{$page}{'status'} $page "; print STDERR " linked from ", join(' ', keys(%{$linker{$page}})), "\n"; } } $writer->endTag('urlset'); $writer->end(); sub start { my($tagname, $attr) = @_; if (($tagname eq 'a' or $tagname eq 'link') and defined($attr->{'href'})) { my $target = new URI::URL($attr->{'href'}, $url)->abs->canonical(); $target->fragment(undef); $target->query(undef); if (($target->scheme() eq $myscheme and $target->port eq $myport and $target->host eq $myhost) and $target->as_string() !~ /\@/) { push(@todo, $target); $linker{$target}{$url} = 1; } if (defined($attr->{'rev'}) and $attr->{'rev'} eq 'made') { $mymail = $target->to; } elsif ($target->scheme() eq 'mailto' and !$mymail) { $mymail = $target->to; } } elsif ($tagname eq 'meta') { foreach my $metaname (qw(changefreq priority)) { if (defined($attr->{'name'}) and $attr->{'name'} eq $metaname) { $done{$url}{$metaname} = $attr->{'content'}; } } } }