package HTML::DublinCore;
use strict;
use warnings;
use Carp qw( croak );
use base qw( DublinCore::Record HTML::Parser );
use DublinCore::Element;
our $VERSION = .4;
=head1 NAME
HTML::DublinCore - Extract Dublin Core metadata from HTML
=head1 SYNOPSIS
use HTML::DublinCore;
## pass HTML to constructor
my $dc = HTML::DublinCore->new( $html );
## get the title element and print it's content
my $title = $dc->element( 'Title' );
print "title: ", $title->content(), "\n";
## get the same title content in one step
print "title: ", $dc->element( 'Title' )->content(), "\n";
## list context will retrieve all of a particular element
foreach my $element ( $dc->element( 'Creator' ) ) {
print "creator: ",$element->content(),"\n";
}
## qualified dublin core
my $creation = $dc->element( 'Date.created' )->content();
=head1 DESCRIPTION
HTML::DublinCore is a module for easily extracting Dublin Core metadata
that is embedded in HTML documents. The Dublin Core is a small set of metadata
elements for describing information resources. Dublin Core is typically
stored in the E<lt>HEADE<gt> of and HTML document using the E<lt>METAE<gt> tag.
For more information on embedding DublinCore in HTML see RFC 2731
L<http://www.ietf.org/rfc/rfc2731>. For a definition of the
meaning of various Dublin Core elements please see
L<http://www.dublincore.org/documents/dces/>.
HTML::DublinCore actually extends Brian Cassidy's excellent DublinCore::Record
framework by adding some asHTML() methods, and a new constructor.
=head1 METHODS
=cut
## valid dublin core elements
=head2 new()
Constructor which you pass HTML content.
$dc = HTML::DublinCore->new( $html );
=cut
sub new {
my ( $class, $html ) = @_;
my $self = $class->SUPER::new;
bless $self, $class;
croak( "please supply string of HTML as argument to new()" ) if !$html;
$self->{ "DC_errors" } = [];
## initialize our parser, and parse
$self->init();
$self->parse( $html );
}
=head2 asHtml()
Serialize your Dublin Core metadata as HTML E<lt>METAE<gt> tags.
print $dc->asHtml();
=cut
sub asHtml {
my $self = shift;
my $html = '';
foreach my $element ( $self->elements ) {
$html .= $element->asHtml() . "\n";
}
return( $html );
}
=head1 TODO
=over 4
=item * More comprehensive tests.
=item * Handle HTML entities properly.
=item * Collect error messages so they can be reported out of the object.
=back
=head1 SEE ALSO
=over 4
=item * DublinCore::Record
=item * Dublin Core L<http://www.dublincore.org/>
=item * RFC 2731 L<http://www.ietf.org/rfc/rfc2731>
=item * HTML::Parser
=item * perl4lib L<http://perl4lib.perl.org/>
=back
=head1 AUTHORS
=over 4
=item * Ed Summers E<lt>ehs@pobox.comE<gt>
=item * Brian Cassidy E<lt>bricas@cpan.orgE<gt>
=back
=head1 COPYRIGHT AND LICENSE
Copyright 2004 by Ed Summers, Brian Cassidy
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
## start tag hander. This automatically gets called in new() when we
## parse HTML since HTML::DublinCore inherits from HTML::Parser.
sub start {
my ( $self, $tagname, $attr, $attrseq, $origtext ) = @_;
return if ( $tagname ne 'meta' );
## lowercase keys
my %attributes = map { lc($_) => $attr->{$_} } keys( %$attr );
## parse name attribute (eg. DC.Identifier.ISBN )
return( undef ) if ! exists( $attributes{ name } );
my ( $namespace, $element, $qualifier ) =
split /\./, lc( $attributes{ name } );
## ignore non-DublinCore data
return( undef ) if $namespace ne 'dc';
## make sure element is dublin core
if ( ! grep { $element } @DublinCore::Record::VALID_ELEMENTS ) {
$self->_error( "invalid element: $element found" );
return( undef );
}
## return if we don't have a content attribute
if ( ! exists( $attributes{ content } ) ) {
$self->_error( "element $element lacks content" );
return( undef );
}
## create a new HTML::DublinCore::Element object
my $dc = DublinCore::Element->new();
$dc->name( $element );
$dc->qualifier( $qualifier );
$dc->content( $attributes{ content } );
if ( exists( $attributes{ scheme } ) ) {
$dc->scheme( $attributes{ scheme } );
}
if ( exists( $attributes{ lang } ) ) {
$dc->language( $attributes{ lang } );
}
## stash it for later
$self->add( $dc );
}
sub _error {
my ( $self, $msg ) = @_;
push( @{ $self->{ DC_errors } }, $msg );
return( 1 );
}
# add in a method to write DC elements as HTML meta tags.
package DublinCore::Element;
sub asHtml {
my $self = shift;
my $name = ucfirst( $self->name() );
if ( $self->qualifier() ) { $name .= '.' . $self->qualifier(); }
my $content = $self->content();
my $scheme = $self->scheme();
my $lang = $self->language();
my $html = qq(<meta name="DC.$name" content="$content");
if ( $scheme ) {
$html .= qq( scheme="$scheme");
}
if ( $lang ) {
$html .= qq( lang="$lang");
}
$html .= '>';
return ( $html );
}
1;