#!/usr/bin/perl -w
# element: extract the contents of an element by element id
# or class from a file or URL.
# Don Marti
#
# Usage: element [-n] id document
#
#   id        value to look for; it is tried, in order, as an element id,
#             a class, a name attribute, and finally a tag name
#   document  filename or URL of the HTML source
#
# Will add a trailing newline if needed; use -n to suppress
#
# References:
# "Scanning HTML" by Sean M. Burke
# http://www.foo.be/docs/tpj/issues/vol5_3/tpj0503-0008.html
# perldoc HTML::TreeBuilder
# perldoc HTML::Element

use strict;
use File::Slurp qw(slurp);
use Getopt::Std;
use HTML::TreeBuilder;
use LWP::Simple qw(get);

our($opt_n);
getopts('n');

my $id = $ARGV[0];       # id (or failing that, class, name or tag) to get
my $document = $ARGV[1]; # filename or URL of the source

die "Too many arguments\n" if scalar(@ARGV) > 2;

# Test defined/length rather than truth so that a literal "0" is accepted
# as an id or as a filename.
die "Usage: $0 id document\n"
    if !defined($id) or !length($id)
    or !defined($document) or !length($document);

# Try the argument as a local file first. Only when it cannot be read
# (slurp returns undef in quiet mode) fall back to fetching it as a URL;
# an existing but empty file must not trigger the network fallback.
my $html = slurp($document, err_mode => 'quiet');
$html = get($document) if !defined($html);
die "Can't get $document\n" if !defined($html);

my $tree = HTML::TreeBuilder->new();
$tree->parse($html);
$tree->eof;

# Check each attribute in priority order and stop at the first element
# whose attribute value equals $id. "_tag" is HTML::Element's internal
# attribute holding the tag name.
foreach my $attr (qw(id class name _tag)) {
    my $e = $tree->look_down($attr, $id);
    next if !defined($e);

    # Child nodes are either HTML::Element objects (serialize them) or
    # plain text segments (use as-is).
    my $content = join '',
        map { ref($_) ? $_->as_HTML : $_ } $e->content_list;

    print $content;
    print "\n" if !$opt_n and $content !~ /\n$/;
    last;
}