package XML::SAX::RTF; require 5.005_62; use strict; use XML::SAX::Base; use vars qw( $VERSION @ISA @EXPORT @EXPORT_OK ); require Exporter; @ISA = qw( Exporter XML::SAX::Base ); @EXPORT = qw( Version ); $VERSION = '0.2'; sub Version { $VERSION; } our %features = ( DEBUG => 0, ); # # internal globals # my $file = ''; # name of file being parsed my $inbuf; # input buffer with RTF to be processed my $level; # element nesting level in result doc my @elements; # open element stack for result doc sub new { # # constructor # my $class = shift; my $obj = {@_}; my $self = bless( $obj, $class ); return $self; } sub parse_file { # # parse a document, one line at a time # my $self = shift; $file = shift; my $buf = ''; if( open( F, $file )) { while( ) { $buf .= $_; } close F; } $self->parse_string( $buf ); $file = ''; } sub parse_string { # # parse a string containing RTF # my $self = shift; $inbuf = shift; $level = 0; @elements = (); $self->_parse(); $self->_close_everything; } sub set_feature { # # set a parser feature # my( $self, $feature, $value ) = @_; if( exists( $features{ $feature })) { $features{ $feature } = $value; } else { $self->SUPER::set_feature( $feature, $value ); } } sub get_feature { # # query a parser feature # my( $self, $feature ) = @_; if( exists( $features{ $feature })) { return $features{ $feature }; } else { return $self->SUPER::get_feature( $feature ); } } my %paramcmds = # # commands with parameters to wrap # ( b => 'bold', deff => 'default-font', deflang => 'language', dy => 'day', edmins => 'minutes-edited', f => 'font', fcharset => 'charset', footery => 'footery', fprq => 'pitch', fs => 'font-size', headery => 'headery', hr => 'hour', id => 'id', keepn => 'keep-next', li => 'indent-left', margl => 'margin-left', margr => 'margin-right', min => 'min', mo => 'month', nofchars => 'number-chars', nofcharsws => 'number-nonspace-chars', nofpages => 'number-pages', nofwords => 'numver-words', nowidctlpar => 'nowidctlpar', pard => 'style-default', qc => 'align-center', qj => 'align-justify', ql => 'align-left', qr => 'align-right', ri => 'indent-right', rtf => 'rtf-version', sa => 'space-after', sb => 'space-before', sbasedon => 'style-base', sec => 'sec', sl => 'space-line', snext => 'style-next', vern => 'version', yr => 'year', ); my %params = # # commands that are parameters, to be wrapped # ( ascii => 'character-set', mac => 'character-set', pc => 'character-set', pca => 'character-set', fnil => 'family', froman => 'family', fswiss => 'family', fmodern => 'family', fscript => 'family', fdecor => 'family', ftech => 'family', fbidi => 'family', ftnil => 'type', fttruetype => 'type', ); my %groupnames = # # commands labelling groups # ( author => 'author', b => 'bold', buptim => 'time-backedup', category => 'category', colortbl => 'color-table', comment => 'comment', company => 'company', creatim => 'time-created', cs => 'char-style', edmins => 'minutes-edited', f => 'font', field => 'field', filetbl => 'file-table', fldinst => 'field-inst', fldrslt => 'field-result', footer => 'footer', footerf => 'footer-first', footerl => 'footer-left', footerr => 'footer-right', footnote => 'footnote', fonttbl => 'font-table', header => 'header', headerf => 'header-first', headerl => 'header-left', headerr => 'header-right', i => 'italic', info => 'info', keywords => 'keywords', listtables => 'list-tables', manager => 'manager', nofchars => 'number-chars', nofcharsws => 'number-nonspace-chars', nofpages => 'number-pages', nofwords => 'numver-words', operator => 'operator', pn => 'para-number', pnseclvl => 'pn-sec-level', pntext => 'pn-text', pntxta => 'pn-txta', pntxtb => 'pn-txtb', printim => 'time-printed', revtbl => 'rev-table', revtim => 'time-revised', s => 'para-style', title => 'title', subject => 'subject', stylesheet => 'stylesheet', ul => 'ul', vern => 'version', version => 'version', ); my %wraptext = # # situations where we want to wrap text in an element # ( font => 'name', 'para-style' => 'name', ); sub _parse { # # parse contents of the input buffer # my $self = shift; while( $inbuf ) { if( $inbuf =~ /^\{/ ) { $self->_handle_group(); } elsif( $inbuf =~ /^\}/ ) { $self->_parse_error() unless( $level > 0 ); return; } elsif( $inbuf =~ /^\\/ ) { $self->_handle_ctlword(); } else { $self->_handle_content(); } } } sub _handle_content { # # process character data # my $self = shift; my $curr = $self->_current_element; if( $inbuf =~ /([^\\\{\}]+)/ ) { my $data = $1; $inbuf = $'; if( exists( $wraptext{ $curr })) { $data =~ s/;//; $self->_indent_start_element( $wraptext{ $curr }); $self->_characters( $data ); $self->_end_element; } else { $self->_characters( $data ); } } else { } } sub _handle_ctlword { # # process a control word # my $self = shift; $inbuf =~ s/^\\//; if( $inbuf =~ /^([a-z]+)/ ) { my $command = $1; my $parameter; $inbuf = $'; if( $inbuf =~ /^(-?[0-9]+)/ ) { $parameter = $1; $inbuf = $'; } if( $inbuf =~ /^ / ) { $inbuf = $'; } $self->_command( $command, $parameter ); } elsif( $inbuf =~/([\\\{\}])/ ) { $self->_characters( $1 ); } elsif( $inbuf =~/([^a-z])/ ) { my $command = $1; $inbuf = $'; $self->_start_element( 'command', {'param' => $command} ); $self->_end_element; } else { parse_error(); } } sub _command { # # process a command # my( $self, $command, $param ) = @_; if( $command eq 'par' ) { $self->_end_element if( $self->_current_element eq 'para' ); $self->_indent_start_element( 'para' ); } elsif( exists( $paramcmds{$command} )) { $self->_indent_start_element( $paramcmds{ $command }); $self->_characters( $param ); $self->_end_element; } elsif( exists( $params{ $command })) { $self->_indent_start_element( $params{ $command }); $self->_characters( $command ); $self->_end_element; } elsif( defined( $param )) { $self->_start_element( $command, { param => $param }); $self->_end_element; } else { $self->_start_element( $command ); $self->_end_element; } } sub _handle_group { # # process a group # my $self = shift; $inbuf =~ s/^\{//; if( $level == 0 ) { $self->_start_element( 'rtfdoc' ); $self->_indent_start_element( 'header' ); } elsif(( $inbuf =~ /^\s*\\([a-z]+)/ and exists( $groupnames{$1} )) or( $inbuf =~ /^\s*\\\*\\([a-z]+)/ and exists( $groupnames{$1} ))) { $inbuf = $'; my $name = $groupnames{$1}; if( $name eq 'info' and $self->_current_element eq 'header' ) { $self->_indent_end_element; $self->_indent_start_element( 'document' ); $self->_indent_start_element( $name ); } elsif( $inbuf =~ /^(-?[0-9]+)/ ) { my $param = $1; $inbuf = $'; $self->_indent_start_element( $name, { number => $param }); } else { $self->_indent_start_element( $name ); } $inbuf = $' if( $inbuf =~ /^ / ); } elsif( $self->_current_element eq 'stylesheet' ) { $self->_indent_start_element( 'para-style' ); } else { $self->_indent_start_element( 'group', { level => $level }); } $self->_parse(); $inbuf =~ s/^\}//; $self->_indent_end_element; } sub _characters { # # clean up characters, call handler # my( $self, $data ) = @_; return unless( defined( $data )); $self->_debug( "CHARACTERS: [$data]", 3 ); $data = $self->_unprotect_chars( $data ); $data =~ s/&/&/g; $data =~ s//>/g; $data =~ s/\n//g; $self->SUPER::characters({ Data => $data }); } sub _newline { # # output a newline character # my $self = shift; $self->SUPER::characters({ Data => "\n" }); } sub _indent_start_element { # # start new element with indentation # my( $self, $name, $params ) = @_; $self->_newline; $self->_characters( ' ' x $level ); $self->_start_element( $name, $params ); } sub _indent_end_element { # # end an indented element # my $self = shift; $self->_newline; $self->_characters( ' ' x ( $level-1 )); $self->_end_element; } sub _start_element { # # generate element start event, push name onto stack # my( $self, $name, $atts ) = @_; $self->_debug( "START ELEMENT: $name", 3 ); $level ++; push( @elements, $name ); if( $atts ) { $self->SUPER::start_element({ Name => $name, Attributes => $atts }); } else { $self->SUPER::start_element({ Name => $name }); } } sub _end_element { # # generate element finished event, pop name from stack # my $self = shift; my $name = pop( @elements ); $self->_debug( "END ELEMENT: $name", 3 ); $level --; $self->SUPER::end_element({ Name => $name }); return $name; } sub _current_element { # # return name of current element on stack # my $self = shift; return $elements[ $#elements ]; } sub _inside { # # return true if current element or ancestor has given name # my $self = shift; my $name = shift; foreach( @elements ) { return 1 if( $name eq $_ ); } return 0; } sub _close_everything { # # close all open elements # my $self = shift; $self->_debug( "ENTER _close_everything", 2 ); while( $level ) { $self->_indent_end_element; } $self->_debug( "EXIT _close_everything", 2 ); } sub _protect_chars { # # escape special characters from parsing # my $self = shift; my $data = shift; $data =~ s/&/\001RTF-AMPERSAND\001/g; #$data =~ s/\\>/\001RTF-GREATER-THAN\001/g; #$data =~ s/\\= $level ) { print STDERR "XML::SAX::RTF DEBUG-$level> $message\n"; } } 1; __END__ ################################################################## =head1 XML::SAX::RTF XML::SAX::RTF - SAX Driver for Microsoft's Rich Text Format (RTF) =head1 SYNOPSIS use XML::SAX::ParserFactory; use XML::SAX::RTF; my $handler = new MyHandler; my $parser = XML::SAX::ParserFactory->parser( Handler => $handler ); $parser->parse_file( shift @ARGV ); package MyHandler; sub new { my $class = shift; my $self = {@_}; return bless( $self, $class ); } sub start_element { my( $self, $data ) = @_; print "<", $data->{Name}; if( exists( $data->{Attributes} )) { my %atts = %{$data->{Attributes}}; foreach my $att ( keys %atts ) { my $val = $atts{$att}; $val =~ s/\"/"/g; print " $att=\"$val\""; } } print ">"; } sub end_element { my( $self, $data ) = @_; print "{Name}, ">"; } sub characters { my( $self, $data ) = @_; print $data->{Data}; } =head1 DESCRIPTION This is a subclass of XML::SAX::Base which implements a SAX driver for RTF documentation. It generates XML that closely matches the structure of RTF, i.e. a set of paragraph types with text and inline tags inside. =head1 AUTHOR Erik Ray (eray@oreilly.com) =head1 MAINTAINER NEEDED The original author seems to have abandoned this module. If you find the module useful, please consider adopting it. Contact grantm@cpan.org for co-maintainer access. =head1 COPYRIGHT Copyright 2002 Erik Ray and O'Reilly & Associates Inc. =cut