From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

{
use 5.008;
use strict;
no warnings qw(recursion);
use Scalar::Util qw(blessed);
use Carp;
BEGIN {
our $VERSION = '2.04'; # version template
}
use List::Util qw(first);
use Scalar::Util qw(weaken);
use Treex::PML::Instance::Common qw(:diagnostics :constants);
use Treex::PML::IO qw(open_uri close_uri rename_uri);
use Encode;
use constant {
XAT_TYPE => 0,
XAT_NAME => 1,
XAT_VALUE => 1,
XAT_NS => 2,
XAT_ATTRS => 3,
XAT_CHILDREN => 5,
XAT_LINE => 4,
};
our $STRICT =1;
our $XTC_FLAGS;
use vars qw( $HAVE_XS );
BEGIN {
if (!$ENV{PML_COMPILE_NO_XS} && eval {
import XML::CompactTree::XS;
$HAVE_XS = 1;
1;
}) {
# print STDERR "Using XML::CompactTree::XS\n" if $HAVE_XS;
$XTC_FLAGS = XML::CompactTree::XS::XCT_ATTRIBUTE_ARRAY()|
XML::CompactTree::XS::XCT_LINE_NUMBERS()|
XML::CompactTree::XS::XCT_IGNORE_COMMENTS();
} else {
import XML::CompactTree;
$XTC_FLAGS = XML::CompactTree::XCT_ATTRIBUTE_ARRAY()|
XML::CompactTree::XCT_LINE_NUMBERS()|
XML::CompactTree::XCT_IGNORE_COMMENTS();
$HAVE_XS = 0;
}
}
my (%handlers,%src,
%handler_cache,@handler_cache,
%schema_cache,@schema_cache
);
# TODO:
# - create one handler per cdata+format type
# - test inline schemas
our $CACHE_HANDLERS=1;
our $CACHE_SCHEMAS=1;
our $MAX_SCHEMA_CACHE_SIZE=50;
our $VALIDATE_CDATA=0;
our $VALIDATE_SEQUENCES=1;
our $BUILD_TREES = 1;
our $LOAD_REFFILES = 1;
our $KNIT = 1;
our $READER_OPTS = {
no_cdata => 1,
clean_namespaces => 1,
expand_entities => 1,
expand_xinclude => 1,
no_xinclude_nodes => 1,
};
require Treex::PML;
sub _get_handlers_cache_key {
my ($schema)=@_;
my $key="$schema"; $key=~s/.*=//; # strip class
return
[
$key,
join ',',
$key,
$VALIDATE_CDATA || 0,
$VALIDATE_SEQUENCES || 0,
$BUILD_TREES || 0,
$LOAD_REFFILES || 0,
$KNIT || 0,
$Treex::PML::Node::TYPE,
$Treex::PML::Node::lbrother,
$Treex::PML::Node::rbrother,
$Treex::PML::Node::parent,
$Treex::PML::Node::firstson,
];
}
sub _get_schema_cache_key {
my ($schema_file)=@_;
if ((blessed($schema_file) and $schema_file->isa('URI'))) { # assume URI
if (($schema_file->scheme||'') eq 'file') {
$schema_file = $schema_file->file
} else {
return '0 '.$schema_file;
}
}
if (-f $schema_file) {
my $mtime = (stat $schema_file)[9];
return $mtime.' '.$schema_file;
}
}
sub get_cached_schema {
my ($schema_file)=@_;
return unless defined $schema_file;
my $cached = $schema_cache{$schema_file};
if ($cached and $schema_cache[-1] ne $schema_file) {
# move the last retrieved schema to the end of the queue
@schema_cache = ((grep { $_ ne $schema_file } @schema_cache),$schema_file);
}
return $cached;
}
sub cache_schema {
my ($key,$schema)=@_;
push @schema_cache,$key;
$schema_cache{$key} = $schema;
if (@schema_cache > $MAX_SCHEMA_CACHE_SIZE) {
my $del = delete $schema_cache{ shift @schema_cache };
delete $handler_cache{ $del }; # delete also from the handler cache
@handler_cache = grep { $_->[0] ne $del } @handler_cache;
if (exists &Treex::PML::Instance::Writer::forget_schema) {
Treex::PML::Instance::Writer::forget_schema($schema);
}
}
}
sub get_cached_handlers {
my ($key)=@_;
my $subkey = $key->[1];
my $cached = $handler_cache{ $key->[0] }{ $subkey };
if ($cached and $handler_cache[-1][1] ne $subkey) {
# move the last retrieved schema to the end of the queue
@handler_cache = ((grep { $_->[1] ne $subkey } @handler_cache),$key);
}
return $cached;
}
sub cache_handlers {
my ($key,$handlers)=@_;
my $subkey = $key->[1];
push @handler_cache,$key;
$handler_cache{$key->[0]}{$subkey} = $handlers;
if (@handler_cache > $MAX_SCHEMA_CACHE_SIZE) {
my $del = shift @handler_cache;
delete $handler_cache{ $del->[0] }{ $del->[1] };
}
}
sub load {
my $ctxt = shift;
my $opts = shift;
if (ref($opts) ne 'HASH') {
croak("Usage: Treex::PML::Instance->load({option=>value,...})\n");
}
if (!ref($ctxt)) {
$ctxt = Treex::PML::Factory->createPMLInstance;
}
my $config = $opts->{config};
if ($config and ref(my $load_opts = $config->get_data('options/load'))) {
$opts = {%$load_opts, %$opts};
}
$Treex::PML::Instance::DEBUG = $config->get_data('options/debug') if (!$Treex::PML::Instance::DEBUG and $config and defined($config->get_data('options/debug')));
local $READER_OPTS = { %$READER_OPTS, %{$opts->{parser_options} || {}} };
if (exists $opts->{filename}) {
$ctxt->set_filename( $opts->{use_resources}
? Treex::PML::FindInResourcePaths($opts->{filename})
: $opts->{filename}
);
}
my $reader;
my $fh_to_close;
# print Dumper($opts),"\n";
if (defined $opts->{dom}) {
$reader = XML::LibXML::Reader->new(DOM => delete $opts->{dom}, %$READER_OPTS);
} elsif (defined $opts->{fh}) {
$reader = XML::LibXML::Reader->new(IO => $opts->{fh}, %$READER_OPTS,
URI => $ctxt->{'_filename'},
%$READER_OPTS
);
} elsif (defined $opts->{string}) {
$reader = XML::LibXML::Reader->new(string => $opts->{string}, %$READER_OPTS,
URI => $ctxt->{'_filename'},
%$READER_OPTS
);
} elsif (defined $ctxt->{_filename}) {
if ($ctxt->{_filename} eq '-') {
$reader = XML::LibXML::Reader->new(FD => \*STDIN,
%$READER_OPTS,
);
} else {
$fh_to_close = open_uri($ctxt->{_filename});
$reader = XML::LibXML::Reader->new(FD => $fh_to_close,
URI => $ctxt->{_filename},
%$READER_OPTS,
);
}
} else {
croak("Treex::PML::Instance->load: at least one of filename, fh, string, or dom arguments are required!");
}
eval {
# check NS
$reader->nextElement();
my @transform_map =
grep {
my $id = $_->{id};
if (defined($id) and length($id)) {
$_
} else {
warn(__PACKAGE__.": Skipping PML transform in ".$config->get_url." (required attribute id missing):".Dumper($_));
()
}
}
(eval {
($config and $config->get_root) ? $config->get_root->{transform_map}->values : ()
});
my $root_element = $reader->localName;
my $root_ns = $reader->namespaceURI || '';
if ($root_ns ne PML_NS
or grep { (($_->{ns}||'') eq PML_NS and ($_->{root}||'') eq $root_element) } @transform_map) {
if ($config and $config->get_root and eval { require XML::LibXSLT; 1 }) {
# TRANSFORM
$reader->preserveNode;
$reader->finish;
my $dom = $reader->document;
foreach my $transform (@transform_map) {
my $id = $transform->{'id'};
my ($in_xsl) = $transform->{in};
my $type = $in_xsl && $in_xsl->{'type'};
next unless ($type and $type =~ /^(?:xslt|perl|pipe|shell)$/);
my $test = $transform->{'test'};
_debug("Trying transformation rule '$id'");
if (($test or $transform->{ns} or $transform->{root})
and (!$transform->{ns} or $transform->{ns} eq $root_ns)
and (!$transform->{root} or $transform->{root} eq $root_element)
and !$test or eval { $dom->find($test) }) {
if ($type eq 'xslt') {
my $in_xsl_href = URI->new(Encode::encode_utf8($in_xsl->get_member('href')));
next unless $in_xsl_href;
_debug("Transforming to PML with XSLT '$in_xsl_href'");
$ctxt->{'_transform_id'} = $id;
my $params = $in_xsl->content;
my %params;
%params = map { $_->{'name'} => $_->value } $params->values if $params;
$in_xsl_href = Treex::PML::ResolvePath($config->{'_filename'}, $in_xsl_href, 1);
my $xslt = XML::LibXSLT->new;
my $in_xsl_parsed = $xslt->parse_stylesheet_file($in_xsl_href)
|| die("Cannot locate XSL stylesheet '$in_xsl_href' for transformation $id\n");
$dom = $in_xsl_parsed->transform($dom,%params);
$dom->setBaseURI($ctxt->{'_filename'}) if $dom and $dom->can('setBaseURI');
$dom->setURI($ctxt->{'_filename'}) if $dom and $dom->can('setURI');
$reader = XML::LibXML::Reader->new(DOM => $dom);
$reader->nextElement();
last;
} elsif ($type eq 'perl') {
my $code = $in_xsl->get_member('command');
next unless $code;
_debug("Transforming to PML with Perl code: $code");
$ctxt->{'_transform_id'} = $id;
my $params = $in_xsl->content;
my %params;
%params = map { $_->{'name'} => $_->value } $params->values if $params;
$dom = perl_transform($code, $dom, %params);
die("Perl-based transformation '$id' failed: $@") if $@;
die("Perl-based transformation didn't return a XML::LibXML::Document object!\n") unless
(blessed($dom) and $dom->isa('XML::LibXML::Document'));
$dom->setBaseURI($ctxt->{'_filename'}) if $dom and $dom->can('setBaseURI');
$dom->setURI($ctxt->{'_filename'}) if $dom and $dom->can('setURI');
$reader = XML::LibXML::Reader->new(DOM => $dom);
$reader->nextElement();
last;
} elsif ($type eq 'pipe' or $type eq 'shell') {
my $code = $in_xsl->get_member('command');
next unless $code;
_debug("Transforming to PML with $type code: $code");
$ctxt->{'_transform_id'} = $id;
my $params = $in_xsl->content;
my @params;
@params = grep {defined and length } map { $_->{'name'} => $_->value } $params->values if $params;
my $tmp_file_in;
if ($type eq 'pipe') {
(my $fh, $tmp_file_in) = File::Temp::tempfile();
$dom->toFH($fh);
close $fh;
} else {
push @params, $dom->URI;
}
my $tmp_file_out;
{
local *OLDIN;
local *OLDOUT;
open OLDOUT,"<&STDOUT";
open OLDIN,"<&STDIN";
if ($type eq 'pipe') {
open STDIN, '<', $tmp_file_in;
} else {
close STDIN;
}
(undef, $tmp_file_out) = File::Temp::tempfile();
open STDOUT, '>', $tmp_file_out;
system($code,@params);
unlink $tmp_file_in if $tmp_file_in;
open STDIN,"<&OLDIN";
open STDOUT,">&OLDOUT";
}
{
open(my $fh, '<', $tmp_file_out) or die("Failed to read output from pipe transformation: $code\n");
unlink $tmp_file_out if $tmp_file_out;
$reader = XML::LibXML::Reader->new(IO => $fh, URI => $ctxt->{'_filename'});
}
$reader->nextElement();
last;
}
} else {
_debug("failed");
}
}
}
if (($reader->namespaceURI||'') ne PML_NS) {
my $f = $ctxt->{'_filename'} || '';
die("Root element of '$f' isn't in PML namespace: '".($reader->localName()||'')."' ".($reader->namespaceURI()||''))
}
}
$ctxt->{_root} = read_header($ctxt,$reader,$opts);
my $schema = $ctxt->{'_schema'};
unless (ref($schema)) {
die("Instance doesn't provide PML schema!");
}
unless (length($schema->{version}||'')) {
die("PML Schema file ".$ctxt->{'_schema-url'}." does not specify version!");
}
if (index(SUPPORTED_PML_VERSIONS," ".$schema->{version}." ")<0) {
die("Unsupported PML Schema version ".$schema->{version}." in ".$ctxt->{'_schema-url'});
}
{
# preprocess the options selected_references and selected_keys:
# we map the reffile names to reffile id's
my $sel_knit = ($ctxt->{_selected_knits} =
$opts->{selected_knits});
my $sel_refs = ($ctxt->{_selected_references} =
$opts->{selected_references});
croak("Treex::PML::Instance->load: selected_knits must be a Hash ref!")
if defined($sel_knit) && ref($sel_knit) ne 'HASH';
croak("Treex::PML::Instance->load: selected_references must be a Hash ref!")
if defined($sel_refs) && ref($sel_refs) ne 'HASH';
($ctxt->{'_selected_knits_ids'},
$ctxt->{'_selected_references_ids'}) = map {
my $sel = $_;
my $ret = {
(defined($sel) ?
(map {
my $ids = $ctxt->{'_refnames'}->{$_};
my $val = $sel->{$_};
map { $_=>$val }
defined($ids) ? (ref($ids) ? @$ids : ($ids)) : ()
} keys %$sel) : ())
};
$ret
} ($sel_knit,$sel_refs);
}
$ctxt->read_reffiles({use_resources=>$opts->{use_resources}});
$ctxt->{'_no_read_trees'} = $opts->{no_trees};
local $BUILD_TREES = $opts->{no_trees} ? 0 : 1;
local $LOAD_REFFILES = $opts->{no_references} ? 0 : 1;
local $KNIT = $opts->{no_knit} ? 0 : $LOAD_REFFILES;
local $VALIDATE_CDATA =$opts->{validate_cdata} ? 1 : 0;
local $VALIDATE_SEQUENCES =$opts->{ignore_content_patterns} ? 0 : 1;
$ctxt->{'_id-hash'}={};
prepare_handlers($ctxt);
dump_handlers($ctxt) if $opts->{dump_handlers} or $ENV{PML_COMPILE_DUMP};
load_data($ctxt,$reader,$opts);
while ($reader->read) {
if ($reader->nodeType == XML_READER_TYPE_PROCESSING_INSTRUCTION) {
push @{$ctxt->{'_pi'}}, [ $reader->name,$reader->value ];
}
}
$handlers{'#initialize'}->($ctxt);
$ctxt->{_root} = $handlers{'#root'}->($ctxt->{_root});
};
($handlers{'#cleanup'}||sub{})->();
%handlers=();
close_uri($fh_to_close) if defined $fh_to_close;
die $@ if $@;
$ctxt->{'_parser'} = undef;
return $ctxt;
}
######################################################
# $ctxt
sub _reader_address {
my ($ctxt,$reader)=@_;
my $line_number=$reader->lineNumber;
return " at ".$ctxt->{'_filename'}." line ".$line_number."\n";
}
sub read_header {
my ($ctxt,$reader,$opts)=@_;
# manually extract the root node
my $root = [XML_READER_TYPE_ELEMENT,
$reader->localName,
undef,
];
# read root node attributes
$root->[XAT_LINE] = 0;
$root->[XAT_ATTRS] = readAttributes($reader);
my $found_head = 0;
while ($reader->read == 1) {
my $type = $reader->nodeType;
if ($type == XML_READER_TYPE_TEXT) { # no CDATA
die "Unexpected content of a root element preceding <head>"._reader_address($ctxt,$reader);
} elsif ($type == XML_READER_TYPE_ELEMENT) {
if ($reader->localName eq 'head' and $reader->namespaceURI eq PML_NS) {
# we have head!
$found_head = 1;
last;
} else {
die "Unexpected element '".$reader->name."' precedes PML header <head>"._reader_address($ctxt,$reader);
}
}
}
unless ($found_head) {
die "Did not find PML <head> element: the document '".$ctxt->{_filename}."' is not a PML instance!";
}
my (%references,%named_references);
while ($reader->read == 1) {
last if $reader->depth<=1;
my $type = $reader->nodeType;
if ($type == XML_READER_TYPE_ELEMENT and $reader->namespaceURI eq PML_NS) {
my $name = $reader->localName;
if ($name eq 'schema') {
if ($ctxt->{'_schema'}) {
warn "Multiple <schema> elements in a PML <head>!";
$reader->nextSibling || last;
redo;
}
# read schema here:
my %a = @{ readAttributes($reader) || [] };
my $schema_file = delete $a{href};
if (defined $schema_file and length $schema_file) {
$schema_file = URI->new(Encode::encode_utf8($schema_file));
# print "$schema_file\n";
$ctxt->{'_schema-url'} = $schema_file; # store the original URL, not the resolved one!
my $schema_path = Treex::PML::ResolvePath($ctxt->{'_filename'},$schema_file,1);
my $key = _get_schema_cache_key($schema_path);
if (!($ctxt->{'_schema'}=get_cached_schema($key))) {
# print "loading schema $schema_path\n";
$ctxt->{'_schema'} =
Treex::PML::Factory->createPMLSchema({
filename => $schema_path,
use_resources => 1,
revision_error =>
"Error: ".$ctxt->{'_filename'}." requires different revision of PML schema %f: %e\n",
%a, # revision_opts
});
cache_schema($key, $ctxt->{'_schema'}) if $CACHE_SCHEMAS;
}
} else {
# inline schema
$ctxt->{'_schema'} = Treex::PML::Factory->createPMLSchema({
reader=>$reader,
base_url => $ctxt->{'_filename'},
use_resources => 1,
revision_error =>
"Error: ".($ctxt->{'_filename'}||'document')." requires different revision of PML schema %f: %e\n",
%a, # revision_opts
});
}
} elsif ($name eq 'references') {
if ($reader->read) {
while ($reader->depth==3) {
if ($reader->localName eq 'reffile' and
$reader->namespaceURI eq PML_NS) {
my %a = @{ readAttributes($reader) || [] };
my ($id,$name,$href) = @a{qw(id name href)};
if (defined($id) and length($id) and
defined($href) and length($href)) {
if (defined $name and length $name) {
my $prev_ids = $named_references{ $name };
if (defined $prev_ids) {
if (ref($prev_ids)) {
push @$prev_ids,$id;
} else {
$named_references{ $name }=Treex::PML::Factory->createAlt([$prev_ids,$id],1);
}
} else {
$named_references{ $name } = $id;
}
}
# Encode: all filenames must(!) be bytes
$references{$id} = Treex::PML::ResolvePath
($ctxt->{'_filename'},
URI->new(Encode::encode_utf8($href)),
$opts->{use_resources});
# Resources are not used for non-readas references,
# though, they must be handled manually.
} else {
warn "Missing id or href attribute on a <reffile>: ignoring\n";
}
}
$reader->nextSibling || last;
}
}
}
}
}
$ctxt->{'_schema'} or
die "Did not find <schema> element in PML <head>: the document '".$ctxt->{_filename}."' is not a valid PML instance!";
$ctxt->{'_references'} = \%references;
$ctxt->{'_refnames'} = \%named_references;
return $root;
}
sub prepare_handlers {
my ($ctxt,$opts)=@_;
%handlers=();
my $schema = $ctxt->{'_schema'};
my $key=_get_handlers_cache_key($schema);
my $cached = get_cached_handlers($key);
if ($cached) {
%handlers= @$cached;
} else {
compile_schema($schema);
cache_handlers($key,[%handlers]) if $CACHE_HANDLERS;
}
}
sub dump_handlers {
my $dir = '.pml_compile.d';
(-d $dir) || mkdir($dir) || die "Can't dump to $dir: $!\n";
# print "created $dir\n";
for my $f (keys %src) {
my $dump_file = File::Spec->catfile($dir,$f);
open (my $fh, '>:utf8', $dump_file)
|| die "Can't write to $dump_file: $!\n";
my $sub = $src{$f};
$sub=~s/^\s*#line[^\n]*\n//;
print $fh ($sub);
close $fh;
}
}
sub load_data {
my ($ctxt,$reader)=@_;
my $root = $ctxt->{_root};
my ($children);
$reader->read if $reader->nodeType == XML_READER_TYPE_END_ELEMENT;
if ($HAVE_XS) {
my %ns;
$children = XML::CompactTree::XS::readLevelToPerl(
$reader,
$XTC_FLAGS,
\%ns
);
$root->[XAT_NS]=$ns{(PML_NS)} || -1;
} else {
my %ns;
$children = XML::CompactTree::readLevelToPerl(
$reader,
$XTC_FLAGS,
\%ns
);
$root->[XAT_NS]=$ns{(PML_NS)} || -1;
}
$root->[XAT_CHILDREN]=$children;
# print Dumper($root);
# print Dumper({references => $ctxt->{'_references'},
# refnames => $ctxt->{'_refnames'}});
return $root;
}
sub _set_trees_seq {
my ($ctxt,$type,$data)=@_;
$ctxt->{'_pml_trees_type'} = $type;
my $trees = $ctxt->{'_trees'} ||= Treex::PML::Factory->createList;
my $prolog = $ctxt->{'_pml_prolog'} ||= Treex::PML::Factory->createSeq;
my $epilog = $ctxt->{'_pml_epilog'} ||= Treex::PML::Factory->createSeq;
my $phase = 0; # prolog
foreach my $element (@$data) {
my $val = $element->[1];
if (UNIVERSAL::DOES::does($val,'Treex::PML::Node')) {
if ($phase == 0) {
$phase = 1;
}
if ($phase == 1) {
$val->{'#name'} = $element->[0]; # manually delegate_name on this element
push @$trees, $val;
} else {
$prolog->push_element_obj($element);
}
} else {
if ($phase == 1) {
$phase = 2; # start epilog
}
if ($phase == 0) {
$prolog->push_element_obj($element);
} else {
$epilog->push_element_obj($element);
}
}
}
}
sub readAttributes {
my ($r)=@_;
my @attrs;
my ($prefix,$name);
if ($r->moveToFirstAttribute==1) {
do {{
$prefix = $r->prefix;
$name = $r->localName;
push @attrs, ($name,$r->value) unless ($prefix and $prefix eq 'xmlns') or (!$prefix and $name eq 'xmlns');
}} while ($r->moveToNextAttribute==1);
$r->moveToElement;
}
\@attrs;
}
sub _paste_last_code {
my ($node,$prev,$p)=@_;
return qq`
#$node\->{'$Treex::PML::Node::rbrother'}=undef;
$prev\->{'$Treex::PML::Node::rbrother'}=$node;
weaken( $node\->{'$Treex::PML::Node::lbrother'} = $prev );
weaken( $node\->{'$Treex::PML::Node::parent'} = $p );
`;
}
sub _paste_first_code {
my ($node,$p)=@_;
return qq`
#$node\->{'$Treex::PML::Node::rbrother'}=undef;
#$node\->{'$Treex::PML::Node::lbrother'}=undef;
$p\->{'$Treex::PML::Node::firstson'}=$node;
weaken( $node\->{'$Treex::PML::Node::parent'} = $p );
`;
}
sub hash_id_code {
my ($key,$value)=@_;
return q`
for (`.$key.q`) {
if (defined and length) {
if (exists($ID_HASH->{$ID_PREFIX.$_}) and
$ID_HASH->{$ID_PREFIX.$_} != `.$value.q`) {
warn("Duplicated ID '$_'");
}
weaken( $ID_HASH->{$ID_PREFIX.$_} = `.$value.q` );
}
}`
}
sub _fix_id_member {
my ($decl)=@_;
return unless $decl;
my ($idM) = $decl->find_members_by_role('#ID');
if ($idM) {
# what follows is a hack fixing buggy PDT 2.0 schemas
my $cdecl = $idM->get_content_decl(1); # no_resolve
if ($cdecl and $cdecl->get_decl_type == PML_CDATA_DECL and $cdecl->get_format eq 'ID') {
$cdecl->set_format('PMLREF');
} elsif ($cdecl = $idM->get_content_decl()) {
if ($cdecl and $cdecl->get_decl_type == PML_CDATA_DECL and $cdecl->get_format eq 'ID') {
warn "Trying to knit object of type '".$decl->get_decl_path."' which has an #ID-attribute ".
"'".$idM->get_name."' declared as <cdata format=\"ID\"/>. ".
"Note that the data-type for #ID-attributes in objects knitted as DOM should be ".
"<cdata format=\"PML\"/> (Hint: redeclare with <derive> for imported types).";
}
}
}
return $idM;
}
sub knit_code {
my ($decl,$assign,$fail)=@_;
my $sub = q`
if ($ref) {
$ref =~ s/^(?:(.*?)\#)//;
my $file_id = $1||'';
my $do_knit=$selected_knits->{$file_id};
unless (defined($do_knit) and $do_knit==0) {
my $target;
if (length $file_id) {
my $f = $parsed_reffile->{ $file_id };
if (ref $f) {
if (UNIVERSAL::DOES::does($f,'Treex::PML::Instance')) {
$target = $f->{'_id-hash'}->{$ref};
$target->{'#knit_prefix'}=$file_id;
} else { # DOM`;
if ($decl) {
my $idM = _fix_id_member($decl);
my $idM_name = $idM && $idM->get_name;
my $decl_path = $decl->get_decl_path; $decl_path =~ s/^!//;
$sub .= q`
my $dom_node = $ref_index->{$file_id}{$ref} || $f->getElementsById($ref);
if (defined $dom_node) {
$target = $ID_HASH->{$ID_PREFIX.$file_id.'#'.$ref};
if (!defined $target) {
my $p = $ID_PREFIX;
$ID_PREFIX.=$file_id.'#';
my $r = XML::LibXML::Reader->new(string=>'<f xmlns="`.PML_NS.q`">'.$dom_node->toString.'</f>');
$r->nextElement;
# print $r, $dom_node->toString,"\n";
my %ns;
my $tree = XML::CompactTree`.($HAVE_XS ? '::XS' : '').q`::readSubtreeToPerl($r,`.$XTC_FLAGS.q`,\%ns);
my $index = $pml_ns_index;
$pml_ns_index = $ns{'`.PML_NS.q`'} || -1;
# print "index: $pml_ns_index\n";
# print Dumper($tree->[0][XAT_CHILDREN][0]);
$target = $handlers{'`.$decl_path.q`'}->($tree->[XAT_CHILDREN][0]);`;
if ($idM) {
$sub .= q`
$target->{`.$idM_name.q`}=$file_id.'#'.$target->{`.$idM_name.q`} if $target;`;
}
$sub .= q`
$pml_ns_index = $index;
$weaken=0;
$ID_PREFIX=$p;
}
}`;
} else {
$sub .= q`
warn("DOM knit error: knit content type not declared in the schema!\n");`;
}
$sub.=q`
}
} else {
warn("warning: KNIT failed: document '$file_id' not loaded\n");
}
} else {
$target = $ID_HASH->{$ID_PREFIX.$ref};
}
if (ref $target) {`.$assign.q`
} else {
warn("warning: KNIT failed: ID $ref not found in reffile '$file_id'\n");`.$fail.q`
}
}
}
`;
return $sub;
}
sub _report_error {
my ($err)=@_;
if ($STRICT) {die $err} else {warn $err};
}
sub _unhandled {
my ($what,$pml_file,$el,$path)=@_;
_report_error( "Error: $what not declared for type '$path' at ".$pml_file." line ".$el->[XAT_LINE] );
return sub{};
}
sub compile_schema {
my ($schema)=@_;
my $schema_name = $schema->get_root_decl->get_name;
my ($ctxt,$pml_file,$pml_ns_index,$ID_HASH,$ID_PREFIX,$selected_knits,$ref_index,$parsed_reffile,$trees_type,$have_trees);
$handlers{'#cleanup'}= sub {
undef $_ for ($ctxt,$pml_file,$pml_ns_index,$ID_HASH,$ID_PREFIX,$selected_knits,$ref_index,$parsed_reffile);
};
$handlers{'#initialize'}= sub {
my ($instance)=@_;
$ctxt = $instance;
$pml_file = $instance->{'_filename'};
$pml_ns_index = $instance->{_root}->[XAT_NS];
$selected_knits = $instance->{_selected_knits_ids};
$ref_index = $instance->{'_ref-index'};
$ID_HASH = $instance->{'_id-hash'};
$ID_PREFIX = $instance->{'_id_prefix'} || '';
$parsed_reffile=$instance->{'_ref'};
$have_trees = 0;
};
$schema->for_each_decl(sub {
my ($decl)=@_;
# no warnings 'uninitialized';
my $decl_type=$decl->get_decl_type;
my $path = $decl->get_decl_path;
$path =~ s/^!// if $path;
return if $decl_type == PML_ATTRIBUTE_DECL ||
$decl_type == PML_MEMBER_DECL ||
$decl_type == PML_TYPE_DECL ||
$decl_type == PML_ELEMENT_DECL;
if ($decl_type == PML_ROOT_DECL) {
my $name = $decl->get_name;
my $cpath = $decl->get_content_decl->get_decl_path;
$cpath =~ s/^!//;
my $src = $schema_name.'__generated_read_root';
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
unless (ref($p) and
$p->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and
$p->[XAT_NS] == $pml_ns_index and
$p->[XAT_NAME] eq '`.$name.q`'
) {
die q(Did not find expected root element '`.$name.q` in ').$pml_file;
}
return ($handlers{ '`.$cpath.q`' })->($p);
}`;
$src{$src}=$sub;
$handlers{'#root'}=eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_STRUCTURE_DECL) {
# print $path,"\n";
my $src = $schema_name.'__generated_read_structure@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $a=$p->[XAT_ATTRS];
my $c=$p->[XAT_CHILDREN];
# print join(",",map {defined($_) ? $_ : 'undef'} $p->[XAT_NAME],$p->[XAT_LINE],@$p)."\n";
my (%s,$k,$v);`;
if ($VALIDATE_CDATA) {
$sub .= q`
if ($a) {
while (@$a) {
$k=shift @$a;
$v=shift @$a;
$s{ $k } = ($handlers{ '`.$path.q`/'.$k }||_unhandled("attribute member '$k'",$pml_file,$p,'`.$path.q`'))->( $v );
}
}`;
} else {
$sub .= q`
%s = @$a if $a;`;
}
$sub .= q`
if ($c) {
for my $el (@$c) {
unless (ref($el) and $el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT
and $el->[XAT_NS] == $pml_ns_index) {
if (!ref($el) || $el->[XAT_TYPE] == XML_READER_TYPE_TEXT || $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
warn q(Ignoring unexpected text content ').$el->[XAT_VALUE].q(' in a structure '`.$path.q`');
} elsif ($el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT) {
warn q(Ignoring unexpected element ').$el->[XAT_NAME].q(' in a structure '`.$path.q`');
}
next;
}
$k = $el->[XAT_NAME];
$s{ $k } = ($handlers{ '`.$path.q`/'.$k }||_unhandled("member '$k'",$pml_file,$el,'`.$path.q`'))->($el);
}
}`;
my ($id, $children_member);
for my $member ($decl->get_members) {
my $mdecl = $member->get_content_decl;
if ($member->is_required) {
my $name = $member->get_name;
if ($mdecl && $mdecl->get_role eq '#TREES') {
# this is a bit of a hack:
# in this case, if the trees have been read from the member, the member handler returns
# a stub value '#TREES' that will get deleted
$sub.=q`
ref or ($_ eq '#TREES' and delete($s{'`.$name.q`'})) or warn q(Missing required member '`.$name.q`' in structure '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE] for $s{'`.$name.q`'};`;
} else {
$sub.=q`
ref or defined and length or warn q(Missing required member '`.$name.q`' in structure '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE] for $s{'`.$name.q`'};`;
}
} elsif ($mdecl and $mdecl->get_decl_type == PML_CONSTANT_DECL) {
$sub.=q`
defined or $_="`.quotemeta($mdecl->{value}).q`" for $s{'`.$member->get_name.q`'};`;
}
my $role = $member->get_role;
if ($KNIT and !$role) {
$mdecl ||= $member->get_content_decl;
if ($mdecl and $mdecl->get_decl_type == PML_LIST_DECL and
$mdecl->get_role eq '#KNIT') {
my $mname = $member->get_name;
my $knit_name = $mname; $knit_name=~s/\.rf$//;
# warn("#KNIT on list not yet implemented: ".$member->get_name."\n");
$sub .=q`
my $ref_list = $s{'`.$mname.q`'};
if ($ref_list) {
my (@knit_list,@weaken,$weaken);
for my $ref (@$ref_list) {
$weaken=1;`
.knit_code($mdecl->get_knit_content_decl(),q`
push @knit_list, $target;
push @weaken, $weaken;`,
q`undef $ref_list; last;`)
.q`
}
if (defined $ref_list) {
my $i=0;
for (@knit_list) {
weaken($_) if $weaken[$i++];
}
$s{'`.$knit_name.q`'}=Treex::PML::Factory->createList(\@knit_list);`;
if ($mname ne $knit_name) {
$sub .= q`delete $s{'`.$mname.q`'};`;
}
$sub .= q`
} else {
warn("KNIT failed on list '`.$mname.q`'");
}
}`;
next;
}
}
if ($role eq '#ID') {
$id = $member->get_name;
} elsif (!$trees_type and $role eq '#TREES' and $BUILD_TREES) {
$mdecl ||= $member->get_content_decl;
my $mtype = $mdecl->get_decl_type;
if ($mtype == PML_LIST_DECL) {
# check that content type is of role #NODE
my $cmdecl = $mdecl->get_content_decl;
my $cmdecl_type = $cmdecl->get_decl_type;
unless ($cmdecl && ($cmdecl->get_role||'') eq '#NODE' &&
($cmdecl_type == PML_STRUCTURE_DECL or
$cmdecl_type == PML_CONTAINER_DECL)) {
_report_error("List '$path' with role #TREES may only contain structures or containers with role #NODE in schema ".
$decl->get_schema->get_url."\n");
}
$trees_type = $mdecl;
$sub .= q`
unless ($have_trees) {
$ctxt->{'_pml_trees_type'} = $trees_type;
$have_trees=1;
$ctxt->{'_trees'} = delete $s{'`.$member->get_name.q`'};
}`;
} elsif ($mtype == PML_SEQUENCE_DECL) {
$trees_type = $mdecl;
$sub .= q`
unless ($have_trees) {
$have_trees=1;
defined($_) && _set_trees_seq($ctxt,$trees_type,$_->elements_list) for (delete $s{'`.$member->get_name.q`'});
}`;
} else {
_report_error("#TREES member '$path/".$member->get_name."' is neither a list nor a sequence in schema ".$member->get_schema->get_url."\n");
}
} elsif ($role eq '#CHILDNODES') {
if ($children_member) {
_report_error("#CHILDNODES role defined on multiple members of type '$path': '$children_member' and '".$member->get_name."' in schema ".$member->get_schema->get_url."\n");
} else {
$children_member=$member->get_name;
}
} elsif ($role eq '#KNIT' and $KNIT) {
my $mname = $member->get_name;
my $knit_name = $mname; $knit_name=~s/\.rf$//;
$sub .= q`
my $ref = $s{'`.$mname.q`'}; my $weaken = 1;`
.knit_code($member->get_knit_content_decl,q`
if ($weaken) {
weaken( $s{'`.$knit_name.q`'}=$target );
} else {
$s{'`.$knit_name.q`'}=$target;
} `.
($mname ne $knit_name ? q`delete $s{'`.$mname.q`'};` : ''), '');
}
}
if ($decl->get_role eq '#NODE' and $BUILD_TREES) {
$sub .= q`
my $node = Treex::PML::Factory->createTypedNode($decl,\%s,1);
# my $node = bless \%s, 'Treex::PML::Node';
# $node->{`.$Treex::PML::Node::TYPE.q`}=$decl;`;
if ($children_member) {
my $cdecl = $decl->get_member_by_name($children_member)->get_content_decl;
my $ctype = $cdecl->get_decl_type;
if ($ctype == PML_LIST_DECL) {
my $cmdecl = $cdecl->get_content_decl;
my $cmdecl_type = $cmdecl->get_decl_type;
unless ($cmdecl->get_role eq '#NODE' &&
($cmdecl_type == PML_STRUCTURE_DECL or
$cmdecl_type == PML_CONTAINER_DECL)) {
_report_error("List '$path' with role #CHILDNODES may only contain structures or containers with role #NODE in schema '".
$decl->get_schema->get_url."'; got ".$cmdecl->get_decl_type_str." (".$cmdecl->get_decl_path.") with role '".$cmdecl->get_role."' instead!\n");
}
$sub .= q`
my $content = delete $node->{'`.$children_member.q`'};
if ($content) {
my $prev;
foreach my $son (@{ $content }) {
if ($prev) {
`._paste_last_code(qw($son $prev $node)).q`
} else {
`._paste_first_code(qw($son $node)).q`
}
$prev = $son;
}
}`;
} elsif ($ctype == PML_SEQUENCE_DECL) {
for my $edecl ($cdecl->get_elements) {
my $cmdecl = $edecl->get_content_decl;
my $cmdecl_type = $cmdecl->get_decl_type;
unless ($cmdecl->get_role eq '#NODE' &&
($cmdecl_type == PML_STRUCTURE_DECL or
$cmdecl_type == PML_CONTAINER_DECL)) {
_report_error("Sequence '$path' with role #CHILDNODES may only contain structures or containers with role #NODE in schema '".
$decl->get_schema->get_url."'; got ".$cmdecl->get_decl_type_str." (".$cmdecl->get_decl_path.") with role '".$cmdecl->get_role."' instead!\n");
}
}
$sub .= q`
my $content = delete $node->{'`.$children_member.q`'};
if ($content) {
# $content->delegate_names('#name');
foreach my $element (@{$content->[0]}) { # manually delegate
$element->[1]{'#name'} = $element->[0]; # store element's name in key $key of its value
}
my $prev;
foreach my $son (map $_->[1], @{$content->[0]}) { # $content->values
if ($prev) {
`._paste_last_code(qw($son $prev $node)).q`
} else {
`._paste_first_code(qw($son $node)).q`
}
$prev = $son;
}
}`;
} else {
_report_error("Role #CHILDNODES can only occur on a structure member of type list or sequence, not on ".$cdecl->get_decl_type_str." '$path' in schema ".$cdecl->get_schema->get_url."\n");
}
}
} else {
$sub.=q`
my $node = Treex::PML::Factory->createStructure(\%s,1);
# my $node = bless \%s, 'Treex::PML::Struct';
`;
}
if (defined $id) {
$sub.=hash_id_code(qq(\$s{'$id'}),'$node');
}
$sub.=q`
return $node;
}`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval($sub); die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_CONTAINER_DECL) {
my %attributes;
@attributes{ map $_->get_name, $decl->get_attributes } = ();
my $cdecl = $decl->get_content_decl;
my $cpath = $cdecl && $cdecl->get_decl_path;
$cpath=~s/^!// if $cpath;
my $src = $schema_name.'__generated_read_container@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $a=$p->[XAT_ATTRS];
my $c=$p->[XAT_CHILDREN];
my (%s,$k,$v,$content,@a_rest);
if ($a) {
while (@$a) {
$k=shift @$a;
$v=shift @$a;
if (exists $attributes{$k}) {`;
if ($VALIDATE_CDATA) {
$sub .= q`
$s{ $k } = ($handlers{ '`.$path.q`/'.$k }||_unhandled("attribute '$k'",$pml_file,$p,'`.$path.q`'))->( $v );`;
} else {
$sub .= q`
$s{ $k } = $v;`;
}
$sub .= q`
} else {
push @a_rest, $k, $v;
}
}
}
$p->[XAT_ATTRS]=\@a_rest;`;
if ($cdecl) {
$sub .= q`
$content = $handlers{ '`.$cpath.q`' }->($p);`;
} else {
$sub .= q`
!$c or !grep { !($_->[XAT_TYPE] == XML_READER_TYPE_WHITESPACE or $_->[XAT_TYPE] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) } @$c or _report_error(qq(Unexpected content of an empty container type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);`;
}
my $id;
for my $member ($decl->get_attributes) {
if ($member->is_required) {
my $name = $member->get_name;
$sub.=q`
ref or defined and length or _report_error(q(missing required attribute '`.$name.q`' in container '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]) for $s{'`.$name.q`'};`;
}
if ($member->get_role eq '#ID') {
$id = $member->get_name;
}
}
if ($decl->get_role eq '#NODE' and $BUILD_TREES) {
$sub .= q`
my $node = Treex::PML::Factory->createTypedNode($decl,\%s,1);
# my $node = bless \%s, 'FSNode';
# $node->{`.$Treex::PML::Node::TYPE.q`}=$decl;`;
if ($cdecl and ($cdecl->get_role||'') eq '#CHILDNODES') {
my $ctype = $cdecl->get_decl_type;
if ($ctype == PML_LIST_DECL) {
my $cmdecl = $cdecl->get_content_decl;
my $cmdecl_type = $cmdecl->get_decl_type;
unless ($cmdecl->get_role eq '#NODE' &&
($cmdecl_type == PML_STRUCTURE_DECL or
$cmdecl_type == PML_CONTAINER_DECL)) {
_report_error("List '$path' with role #CHILDNODES may only contain structures or containers with role #NODE in schema '".
$decl->get_schema->get_url."'; got ".$cmdecl->get_decl_type_str." (".$cmdecl->get_decl_path.") with role '".$cmdecl->get_role."' instead!\n");
}
$sub .= q`
if ($content) {
my $prev;
foreach my $son (@{ $content }) {
if ($prev) {
`._paste_last_code(qw($son $prev $node)).q`
} else {
`._paste_first_code(qw($son $node)).q`
}
$prev = $son;
}
}`;
} elsif ($ctype == PML_SEQUENCE_DECL) {
for my $edecl ($cdecl->get_elements) {
my $cmdecl = $edecl->get_content_decl or
_report_error("Element '".$edecl->get_name."' of sequence '$path' has no content type declaration");
my $cmdecl_type = $cmdecl->get_decl_type;
unless ($cmdecl->get_role eq '#NODE' &&
($cmdecl_type == PML_STRUCTURE_DECL or
$cmdecl_type == PML_CONTAINER_DECL)) {
_report_error("Sequence '$path' with role #CHILDNODES may only contain structures or containers with role #NODE in schema '".
$decl->get_schema->get_url."'; got ".$cmdecl->get_decl_type_str." (".$cmdecl->get_decl_path.") with role '".$cmdecl->get_role."' instead!\n");
}
}
$sub .= q`
if ($content) {
# $content->delegate_names('#name');
foreach my $element (@{$content->[0]}) { # manually delegate
$element->[1]{'#name'} = $element->[0]; # store element's name in key $key of its value
}
my $prev;
foreach my $son (map $_->[1], @{$content->[0]}) { # $content->values
if ($prev) {
`._paste_last_code(qw($son $prev $node)).q`
} else {
`._paste_first_code(qw($son $node)).q`
}
$prev = $son;
}
}`;
} else {
_report_error("Role #CHILDNODES can only occur on a container content type if it is a list or sequence, not on a ".$cdecl->get_decl_type_str." '".$path."' in schema ".$cdecl->get_schema->get_url."\n");
}
} elsif ($cdecl) {
$sub .= q`
$node->{'#content'} = $content if $content;`;
}
} else {
$sub.=q`
my $node = Treex::PML::Factory->createContainer($content,\%s,1);
# $s{'#content'}=$content if $content;
# my $node = bless \%s, 'Treex::PML::Container';`;
}
if (defined $id) {
$sub.=hash_id_code(qq(\$s{'$id'}),'$node');
}
$sub.=q`
return $node;
}`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval($sub); die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_SEQUENCE_DECL) {
my $src = $schema_name.'__generated_read_sequence@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $c=$p->[XAT_CHILDREN];
return undef unless $c and @$c;
my @seq;
my $k;
for my $el (@$c) {
if (ref($el) and $el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT
and $el->[XAT_NS] == $pml_ns_index) {
# print "element: $el->[XAT_NAME]\n";
$k = $el->[XAT_NAME];
push @seq, bless [$k, ($handlers{ '`.$path.q`/'.$k }||_unhandled("element '$k'",$pml_file,$el,'`.$path.q`'))->($el)], 'Treex::PML::Seq::Element';`;
if ($decl->is_mixed) {
$sub .= q`
} elsif (!ref($el)) {`;
$sub .= q`
push @seq, bless ['#TEXT',$el], 'Treex::PML::Seq::Element';
} elsif ($el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA
or $el->[XAT_TYPE] == XML_READER_TYPE_WHITESPACE or $el->[XAT_TYPE] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
push @seq, bless ['#TEXT',$el->[XAT_VALUE]], 'Treex::PML::Seq::Element';
}`;
} else {
$sub .= q`
} elsif (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
_report_error(q(Unexpected text content in a non-mixed sequence '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}`;
}
$sub .= q`
}`;
my $content_pattern = $decl->get_content_pattern;
if ($VALIDATE_SEQUENCES and $content_pattern) {
my $re = Treex::PML::Seq::content_pattern2regexp($content_pattern);
$sub .= q`
unless (join('',map '<'.$_->[0].'>',@seq) =~ m{^`.$re.q`$}ox) {
warn("Sequence content (".join(",",map $_->[0], @seq).") does not follow the pattern `.quotemeta($content_pattern).q` in ".$pml_file.' line '.$p->[XAT_LINE]);
}`;
}
if (!$trees_type and $decl->get_role eq '#TREES' and $BUILD_TREES) {
$trees_type = $decl;
$sub .= q`
unless ($have_trees) {
$have_trees=1;
_set_trees_seq($ctxt,$trees_type,\@seq);
return;
}`;
}
if ($content_pattern) {
$sub .= q`
return Treex::PML::Factory->createSeq(\@seq, "`.quotemeta($content_pattern).q`",1);
}`;
} else {
$sub .= q`
return Treex::PML::Factory->createSeq(\@seq, undef, 1);
}`;
}
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_LIST_DECL) {
# print $path."\t@".$decl->get_decl_type_str,"\n";
my $cdecl = $decl->get_content_decl
or croak("Invalid PML Schema: list type without content: ",$decl->get_decl_path);
my $cpath = $cdecl->get_decl_path;
$cpath=~s/^!//;
my $src = $schema_name.'__generated_read_list@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $c=$p->[XAT_CHILDREN];
my $a=$p->[XAT_ATTRS];
return undef unless $c and @$c or $a and @$a;
my @list;
my $singleton = $a && @$a ? 1 : 0;
unless ($singleton) {
for my $el (@$c) {
if (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
$singleton = 1;
last;
} elsif ($el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT) {
$singleton = 1 if $el->[XAT_NAME] ne 'LM' and $el->[XAT_NS] == $pml_ns_index;
last;
}
}}
if ($singleton) {
@list = ($handlers{ '`.$cpath.q`' }->($p));
} else {
for my $el (@$c) {
if (ref($el) and $el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
$el->[XAT_NAME] eq 'LM' or _report_error(q(Unexpected non-LM element ').$el->[XAT_NAME].q(' in a list: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
push @list, $handlers{ '`.$cpath.q`' }->($el);
} elsif (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
_report_error(q(Unexpected text content in a list '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
}
}`;
if (!$trees_type and $decl->get_role eq '#TREES' and $BUILD_TREES) {
my $cdecl_type = $cdecl->get_decl_type;
unless ($cdecl && ($cdecl->get_role||'') eq '#NODE' &&
($cdecl_type == PML_STRUCTURE_DECL or
$cdecl_type == PML_CONTAINER_DECL)) {
_report_error("List '$path' with role #TREES may only contain structures or containers with role #NODE in schema ".
$decl->get_schema->get_url."\n");
}
$trees_type = $decl;
$sub .= q`
unless ($have_trees) {
$have_trees = 1;
$ctxt->{'_pml_trees_type'} = $trees_type;
$ctxt->{'_trees'} = Treex::PML::Factory->createList(\@list,1);
return;
}`;
}
$sub .= q`
return Treex::PML::Factory->createList(\@list,1);
}`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_ALT_DECL) {
# print $path."\t@".$decl->get_decl_type_str,"\n";
my $cpath = $decl->get_content_decl->get_decl_path;
$cpath=~s/^!//;
my $src = $schema_name.'__generated_read_alt@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $c=$p->[XAT_CHILDREN];
my $a=$p->[XAT_ATTRS];
return undef unless $c and @$c or $a and @$a;
my $singleton = $a && @$a ? 1 : 0;
unless ($singleton) {
for my $el (@$c) {
if (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
$singleton = 1;
last;
} elsif ($el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
$singleton = 1 if $el->[XAT_NAME] ne 'AM';
last;
}
}
}
if ($singleton) {
return $handlers{ '`.$cpath.q`' }->($p);
} else {
my @alt;
for my $el (@$c) {
if (ref($el) and $el->[XAT_TYPE] == XML_READER_TYPE_ELEMENT and $el->[XAT_NS] == $pml_ns_index) {
$el->[XAT_NAME] eq 'AM' or _report_error(q(Unexpected non-AM element ').$el->[XAT_NAME].q(' in an alt: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
push @alt, $handlers{ '`.$cpath.q`' }->($el);
} elsif (!ref($el) or $el->[XAT_TYPE] == XML_READER_TYPE_TEXT or $el->[XAT_TYPE] == XML_READER_TYPE_CDATA) {
_report_error(q(Unexpected text content in an alt: '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
}
return @alt == 0 ? undef : @alt == 1 ? $alt[0] :
#return bless \@alt, 'Treex::PML::Alt';
Treex::PML::Factory->createAlt(\@alt,1);
}
}
`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_CDATA_DECL) {
my $src = $schema_name.'__generated_read_cdata@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $text;
if (ref($p)) {
my $c = $p->[XAT_CHILDREN];
return undef unless $c and @$c;
my $type;
$text = join '',
map {
if (ref($_)) {
$type = $_->[XAT_TYPE];
if ($type == XML_READER_TYPE_TEXT ||
$type == XML_READER_TYPE_WHITESPACE ||
$type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ||
$type == XML_READER_TYPE_CDATA) {
$_->[XAT_VALUE]
} elsif ($type == XML_READER_TYPE_ELEMENT) {
_report_error(q(Element found where only character data were expected in element <).$_->[XAT_NAME].q(> of CDATA type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
} else {
$_
}
} @$c;`;
my $format_checker;
if ($VALIDATE_CDATA and $decl->get_format ne 'any') {
$sub .=q`
} else {
$text = $p;
}`;
$format_checker = $decl->_get_format_checker();
if (defined $format_checker) {
if (ref($format_checker) eq 'CODE') {
$sub .= q`
if (defined $text and length $text and !$format_checker->($text)) {`;
} else {
$sub .= q`
if (defined $text and length $text and $text !~ $format_checker) {`;
}
$sub .= q`
warn("CDATA value '$text' does not conform to format '`.$decl->get_format.q`' at ".$pml_file.' line '.$p->[XAT_LINE]);
}`;
}
$sub .= q`
return $text;
}`;
} else {
$sub .=q`
return $text;
} else {
return $p;
}
}`;
}
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_CHOICE_DECL) {
# print $path,"\n";
my $value_hash = $decl->{value_hash};
unless ($value_hash) {
$value_hash={};
@{$value_hash}{@{$decl->{values}}}=();
$decl->{value_hash}=$value_hash;
}
my $src = $schema_name.'__generated_read_choice@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $text;
if (ref($p)) {
my $c = $p->[XAT_CHILDREN];
return undef unless @$c;
$c=$c->[0];
if (ref($c)) {
my $type = $c->[XAT_TYPE];
if ($type == XML_READER_TYPE_TEXT ||
$type == XML_READER_TYPE_WHITESPACE ||
$type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ||
$type == XML_READER_TYPE_CDATA) {
$text = $c->[XAT_VALUE]
} elsif ($type == XML_READER_TYPE_ELEMENT) {
_report_error(q(Element found where only character data were expected in element <).$p->[XAT_NAME].q(> of choice type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
} else {
$text = $c;
}
} else {
$text=$p;
}
return undef unless defined $text;
exists($value_hash->{$text}) or _report_error(qq(Invalid value '$text' in element <).$p->[XAT_NAME].q(> of choice type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
return $text;
}`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
} elsif ($decl_type == PML_CONSTANT_DECL) {
# print $path,"\n";
my $value = quotemeta($decl->{value});
my $src = $schema_name.'__generated_read_constant@'.$path;
$src=~y{/}{@};
my $sub = q`#line 0 ".pml_compile.d/`.$src.q`"
sub {
my ($p)=@_;
my $text;
if (ref($p)) {
my $c = $p->[XAT_CHILDREN];
return undef unless $c and @$c;
$c=$c->[0];
if (ref($c)) {
my $type = $c->[XAT_TYPE];
if ($type == XML_READER_TYPE_TEXT ||
$type == XML_READER_TYPE_WHITESPACE ||
$type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ||
$type == XML_READER_TYPE_CDATA) {
$text = $c->[XAT_VALUE]
} elsif ($type == XML_READER_TYPE_ELEMENT) {
_report_error(q(Unexpected element occurrence in element <).$p->[XAT_NAME].q(> of constant type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
}
} else {
$text = $c;
}
} else {
$text=$p;
}
!(defined($text) and length($text)) or ($text eq "`.$value.q`") or
_report_error(qq(Invalid value '$text' in element <).$p->[XAT_NAME].q(> of constant type '`.$path.q`' at ).$pml_file.' line '.$p->[XAT_LINE]);
return $text;
}`;
# print $sub;
$src{$src}=$sub;
$handlers{$path} = eval $sub; die _nl($sub)."\n".$@.' ' if $@;
}
# print "@_\n";
});
$schema->for_each_decl(
sub {
my ($decl)=@_;
my $decl_type=$decl->get_decl_type;
if ($decl_type == PML_ATTRIBUTE_DECL ||
$decl_type == PML_MEMBER_DECL ||
$decl_type == PML_ELEMENT_DECL
) {
my $parent = $decl->get_parent_decl;
my $path = $parent->get_decl_path . '/'. $decl->get_name;
$path =~ s/^!// if $path;
my $mdecl;
if ($decl_type == PML_MEMBER_DECL and $decl->is_required) {
# a hack that fixes missing content of a required member
# containing a construct with the role #TREES
#
# the modified handler returns string '#TREES' instead
# and the value gets deleted in the structure handler
$mdecl = $decl->get_content_decl;
if ($mdecl->get_role eq '#TREES' and $mdecl==$trees_type) {
my $mpath = $mdecl->get_decl_path;
$mpath =~ s/^!// if $mpath;
my $handler = $handlers{$mpath};
$handlers{$path}=sub {
if (!$have_trees and $BUILD_TREES) {
my $ret = &$handler;
return '#TREES' if $have_trees and !defined($ret);
return $ret;
} else {
return &$handler;
}
};
return;
}
}
# print "$path\n";
if (!exists($handlers{$path})) {
$mdecl ||= $decl->get_content_decl;
my $mpath = $mdecl && $mdecl->get_decl_path;
if ($mpath) {
$mpath =~ s/^!//;
# print "mapping $path -> $mpath ... $handlers{$mpath}\n";
$handlers{$path} = $handlers{$mpath};
}
}
}
});
}
sub _nl {
my ($str)=@_;
my $i=0;
return join "\n", map sprintf("%4d\t",$i++).$_, split /\n/, $str;
}
}
{
# outside the main blog so that we leak no lexicals other than $dom
sub perl_transform {
return eval shift();
}
}
1;
__END__
=head1 NAME
Treex::PML::Instance::Reader
=head1 DESCRIPTION
This module provides implements the load() method of
L<Treex::PML::Instance> and is not intended for direct use.
=head1 IMPLEMENTATION NOTES
The module analyses a L<Treex::PML::Schema> and generates Perl code to
parse PML instances conforming to that schema (by generating handlers
for individual data types). L<XML::CompactTree::XS> (or
L<XML::CompactTree>) is used to first slurp the XML into in-memory
Perl data structures (much faster than using SAX, XML::LibXML::Reader,
DOM, or any other parsing strategy known). The Perl code generated by
this module transforms these data structures into the coresponding
L<Treex::PML> objects.
The handlers for last 50 PML schemas are cached in memory, to boost
processing large collections of PML instances conforming to only a few
distinct schemas.
The module also implements automatic pluggable XSLT, external command,
or Perl pre-processing (transformation) of the input document; this
pre-processing can be specified in a configuration file
(C<pmlbackend_conf.xml>, see L<Treex::PML::Instance/"CONFIGURATION">
for more details).
=head1 DEBUGGING
If the environment variable PML_COMPILE_DUMP=1 is set, the module
dumps the generated code to the C<.pml_compile.d/> folder in the
current working directory. This is very for debugging or profiling the
generated code.
=head1 SEE ALSO
L<Treex::PML::Instance>, L<Treex::PML::Instance::Writer>,
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2008-2010 by Petr Pajas
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.2 or,
at your option, any later version of Perl 5 you may have available.
=head1 BUGS
None reported... yet.
=cut