package Treex::PML::Seq; use Carp; use warnings; use vars qw($VERSION); BEGIN { $VERSION='2.04'; # version template } use strict; use Treex::PML::List; use Treex::PML::Seq::Element; =head1 NAME Treex::PML::Seq - sequence of PML values of various types =head1 DESCRIPTION This class implements the data type 'sequence'. A sequence contains of zero or more elements (L<Treex::PML::Seq::Element>), each consisting of a name and value. The ordering of elements in a sequence may be constrained by a regular-expression-like pattern operating on element names. Validation of a sequence against this constraint pattern is not automatic but can be performed at any time on demand. =over 4 =item Treex::PML::Seq->new (element_array_ref?, content_pattern?,$reuse?) NOTE: Don't call this constructor directly, use Treex::PML::Factory->createSeq() instead! Create a new sequence (optionally populated with elements from a given array_ref). Each element should be a Treex::PML::Element::Seq object. The second optional argument is a regular expression constraint which can be stored in the object and used later for validating content (see validate() method below). The C<$reuse> argument is a boolean flag indicating whether the passed array reference can be used directly (if C<$reuse> is true) or copied (if C<$reuse> ise false). =cut sub new { my ($class,$array,$content_pattern,$reuse) = @_; $array = [] unless defined($array); return bless [Treex::PML::List->new_from_ref($array,$reuse), # a list consisting of [name,value] pairs $content_pattern # a content_pattern constraint ],$class; } =item $seq->elements ($name?) Return a list of [ name, value ] pairs representing the sequence elements. If the optional $name argument is given, select only elements whose name is $name. =cut sub elements { my ($self,$name)=@_; if (defined $name and $name ne '*') { return grep { $_->[0] eq $name } @{$_[0]->[0]}; } else { return @{$_[0]->[0]}; } } =item $seq->elements_list () Like C<elements> without a name, only this method returns directly the Treex::PML::List object associated with this sequence. =cut sub elements_list { return $_[0]->[0]; } =item $seq->content_pattern () Return the regular expression constraint stored in the sequence object (if any). =cut sub content_pattern { return $_[0]->[1]; } =item $seq->set_content_pattern () Store a regular expression constraint in the sequence object. This expression can be used later to validate sequence content (see validate() method). =cut sub set_content_pattern { $_[0]->[1] = $_[1]; } =item $seq->values (name?) If no name is given, return a list of values of all elements of the sequence. If a name is given, return a list consisting of values of elements with the given name. In array context, the returned value is a list, in scalar context the result is a Treex::PML::List object. =cut sub values { my ($self,$name)=@_; my @values = map { $_->[1] } ((defined($name) and length($name)) ? (grep $_->[0] eq $name, @{$self->[0]}) : @{$self->[0]}); return wantarray ? @values : bless \@values, 'Treex::PML::List'; #->new_from_ref(\@values,1); } =item $seq->names () Return a list of names of all elements of the sequence. In array context, the returned value is a list, in scalar context the result is a Treex::PML::List object. =cut sub names { my @names = map { $_->[0] } $_[0][0]->values; return wantarray ? @names : bless \@names, 'Treex::PML::List'; #Treex::PML::List->new_from_ref(\@names,1); } =item $seq->element_at (index) Return the element of the sequence on the position specified by a given index. Elements in the sequence are indexed as elements in Perl arrays, i.e. starting from $[, which defaults to 0 and nobody sane should ever want to change it. =cut sub element_at { my ($self, $index)=@_; return $self->[0][$index]; } =item $seq->name_at (index) Return the name of the element on a given position. =cut sub name_at { my ($self, $index)=@_; my $el = $self->[0][$index]; return $el->[0] if $el; } =item $seq->value_at (index) Return the value of the element on a given position. =cut sub value_at { my ($self, $index)=@_; my $el = $self->[0][$index]; return $el->[1] if $el; } =item $seq->delegate_names (key?) If all element values are HASH-references, then it is possible to store each element's name in its value under a given key (that is, to delegate the name to the HASH value). The default value for key is C<#name>. It is a fatal error to try to delegate names if some of the values is not a HASH reference. =cut sub delegate_names { my ($self,$key) = @_; $key = '#name' unless defined $key; if (grep { !UNIVERSAL::isa($_->[1],'HASH') } @{$self->[0]}) { croak("Error: sequence contains a non-HASH element (Treex::PML::Seq can only delegate names to values if all values are HASH refs)!"); } foreach my $element (@{$self->[0]}) { $element->[1]{$key} = $element->[0]; # store element's name in key $key of its value } } =item $seq->validate (content_pattern?) Check that content of the sequence satisfies a constraint specified by means of a regular expression C<content_pattern>. If no content_pattern is given, the one stored with the object is used (if any; otherwise undef is returned). Returns: 1 if the content satisfies the constraint, 0 otherwise. =cut sub validate { my ($self,$re) = @_; $re = $self->content_pattern if !defined($re); return unless defined $re; my $content = join "",map { "<$_>"} $self->names; $re=~s/\#/\\\#/g; $re=~s/,/ /g; $re=~s/\s+/ /g; $re=~s/([^()?+*|,\s]+)/(?:<$1>)/g; # warn "'$content' VERSUS /$re/\n"; return $content=~m/^$re$/x ? 1 : 0; } =item $seq->push_element (name, value) Append a given name-value pair to the sequence. =cut sub push_element { my ($self,$name,$value)=@_; push @{$self->[0]},Treex::PML::Seq::Element->new($name,$value); } =item $seq->push_element_obj (obj) Append a given Treex::PML::Seq::Element object to the sequence. =cut sub push_element_obj { my ($self,$obj)=@_; push @{$self->[0]},$obj; } =item $seq->unshift_element (name, value) Prepend a given name-value pair to the sequence. =cut sub unshift_element { my ($self,$name,$value)=@_; unshift @{$self->[0]},Treex::PML::Seq::Element->new($name,$value); } =item $seq->unshift_element_obj (obj) Unshift a given Treex::PML::Seq::Element object to the sequence. =cut sub unshift_element_obj { my ($self,$obj)=@_; unshift @{$self->[0]},$obj; } =item $seq->delete_element (element) Find and remove (all occurences) of a given Treex::PML::Seq::Element object in the sequence. Returns the number of elements removed. =cut =item $seq->delete_element (element) Find and remove (all occurences) of a given Treex::PML::Seq::Element object in the sequence. Returns the number of elements removed. =cut sub delete_element { my ($self,$element)=@_; my $start = @{$self->[0]}; @{$self->[0]} = grep { $_ != $element } @{$self->[0]}; my $end = @{$self->[0]}; return $start-$end; } =item $seq->delete_value (value) Find and remove all elements with a given value. Returns the number of elements removed. =cut sub delete_value { my ($self,$value)=@_; my $start = @{$self->[0]}; my $v; if (ref($value)) { @{$self->[0]} = grep { $v = $_->value; ref($v) and ($v != $value) } @{$self->[0]}; } else { @{$self->[0]} = grep { $v = $_->value; !ref($v) and ($v ne $value) } @{$self->[0]}; } my $end = @{$self->[0]}; return $start-$end; } =item $seq->index_of ($value) Search the sequence for a particular value and return the index of its first occurence in the sequence. Note: Use $seq->elements_list->index_of($element) to search for a Treex::PML::Seq::Element. =cut sub index_of { my ($self,$value)=@_; die 'Usage: Treex::PML::Seq->index_of($value) (wrong number of arguments!)' if @_!=2; my $list = $self->[0]; if (ref($value)) { my $v; for my $i (0..$#$list) { $v = $list->[$i]->value; return $i if ref($v) and $value == $v; } } else { my $v; for my $i (0..$#$list) { $v = $list->[$i]->value; return $i if !ref($v) and $value eq $v; } } return; } # sub splice { # # TODO # } # sub delete_element_at { # # TODO # } # sub store_element_at { # # TODO # } =item $list->empty () Remove all values from the sequence. =cut sub empty { die 'Usage: Treex::PML::Seq->empty() (wrong number of arguments!)' if @_!=1; my $self = shift; $self->[0]->empty; return $self; } =back =head1 AUXILIARY FUNCTIONS =over 5 =item Treex::PML::Seq::content_pattern2regexp($pattern) This utility function converts a given sequence content pattern string into a Perl regular expression. The resulting expression matches a list of element 'tags', where a tag is an element name surrounded by < and >. For example, the content pattern 'A,#TEXT,(B+|C)*' translates roughly to '<A><\#TEXT>(?:(?:<B>)+(?:<C>))*' and matches (a substring of) each of the following strings: '<A><#TEXT>' 'foo<A><#TEXT><B><B><C>bar' '<A><#TEXT><B><C><D>' =back =cut sub content_pattern2regexp { my ($re)=@_; $re=~s/[\${}\\]//g; # sanity $re=~s/\(\?//g; # safety $re=~s/\#/\\\#/g; $re=~s/,/ /g; $re=~s/\s+/ /g; $re=~s/([^()?+*|,\s]+)/(?:<$1>)/g; $re=~s/ //g; return $re; } =head1 SEE ALSO L<Treex::PML>, L<Treex::PML::Factory>, L<Treex::PML::Schema>, L<Treex::PML::Seq::Element>, L<Treex::PML::List> =head1 COPYRIGHT AND LICENSE Copyright (C) 2006-2010 by Petr Pajas This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.2 or, at your option, any later version of Perl 5 you may have available. =cut 1;