#!/usr/bin/perl -w

# element: extract the contents of the first element matched by id
#          (or, failing that, by class, name, or tag name) from a
#          file or URL.
# Don Marti <dmarti@zgp.org>

# Will add a trailing newline if needed; use -n to suppress

# References:
# "Scanning HTML" by Sean M. Burke
#   http://www.foo.be/docs/tpj/issues/vol5_3/tpj0503-0008.html
# perldoc HTML::TreeBuilder
# perldoc HTML::Element

use strict;
use File::Slurp qw(slurp);
use Getopt::Std;
use HTML::TreeBuilder;
use LWP::Simple qw(get);

# Command-line handling.
#   -n   do not append a trailing newline to the output
our($opt_n);
getopts('n');

my $id = $ARGV[0];       # id (or failing that, class, name, or tag) to get
my $document = $ARGV[1]; # filename or URL of the source

die "Too many arguments\n" if scalar(@ARGV) > 2;
# Test for defined/non-empty rather than truthiness so that an
# identifier of "0" is accepted.
die "Usage: $0 id document\n"
    if !defined($id)       or !length($id)
    or !defined($document) or !length($document);

# Try the argument as a local file first; only when it cannot be read
# (slurp returns undef in 'quiet' mode) treat it as a URL.  Checking
# definedness instead of truth keeps an empty local file from being
# mistaken for a read failure.
my $html = slurp($document, err_mode => 'quiet');
$html = get($document) if !defined($html);
die "Can't get $document\n" if !defined($html);

my $tree = HTML::TreeBuilder->new();
$tree->parse($html);
$tree->eof;

# Search by each attribute in priority order and stop at the first hit.
# '_tag' is HTML::Element's internal attribute holding the tag name, so
# the final pass matches on the element name itself.
# The list must be wrapped in parentheses: a bare qw() used as the
# foreach list is a syntax error in Perl 5.18 and later.
foreach my $attr (qw(id class name _tag)) {
    my $e = $tree->look_down($attr, $id);
    next if !defined($e);

    # Child elements are serialized back to HTML; text nodes are plain
    # strings and are emitted as they are.
    my $content = join '',
                  map { ref($_) ? $_->as_HTML : $_ } $e->content_list;
    print $content;
    print "\n" if !$opt_n and $content !~ /\n\z/;
    last;
}