#!/usr/bin/perl -w
use
FindBin
qw( $RealBin $RealScript)
;
import
xml_split::state::parser;
import
xml_split::state::twig;
undef
$Getopt::Std::STANDARD_HELP_VERSION
;
$Getopt::Std::STANDARD_HELP_VERSION
=1;
use
vars
qw( $VERSION $USAGE)
;
$VERSION
=
"0.06"
;
$USAGE
=
"xml_split [-l <level> [-s <size> | -g <nb_grouped>] | -c <cond>] [-b <base>] [-n <nb>] [-e <ext>] [-p <plugin>] [-I <plugin_dir>] [-i] [-d] [-v] [-h] [-m] [-V] <files>\n"
;
{
my
$opt
={};
getopts(
'l:c:b:g:n:e:p:is:dvhmV'
,
$opt
);
$opt
->{n} ||= 2;
$opt
->{I} ||= (
$ENV
{HOME} ||
''
) .
"/.xml_split"
;
if
(
$opt
->{h}) {
die
$USAGE
,
"\n"
; }
if
(
$opt
->{m}) {
exec
"pod2text $RealBin/$RealScript"
; }
if
(
$opt
->{V}) {
print
"xml_split version $VERSION\n"
;
exit
; }
my
%factor
=(
' '
=> 1,
K
=> 1000,
M
=> 1_000_000,
G
=> 1_000_000_000);
if
(
$opt
->{s}) {
if
(
$opt
->{c}) {
die
"cannot use -c and -s at the same time\n"
; }
if
(
$opt
->{s}=~ m{^\s*(\d+)\s*(G[bo]?|M[bo]?|K[bo]?\s*)?$}i)
{
my
(
$size
,
$unit
)= ($1,
uc
substr
( $2 ||
' '
, 0, 1));
$opt
->{s}=
$size
*
$factor
{
$unit
};
}
else
{
die
"invalid size (should be in Kb, Mb or Gb): '$opt->{s}'\n"
; }
}
if
(
$opt
->{g}) {
die
"cannot use -g and -s at the same time\n"
if
(
$opt
->{s});
die
"cannot use -g and -c at the same time\n"
if
(
$opt
->{c});
$opt
->{l} ||= 1;
}
elsif
(
$opt
->{c}) {
die
"cannot use -l and -c at the same time\n"
if
(
$opt
->{l}); }
else
{
$opt
->{l} ||= 1;
$opt
->{c}=
"level( $opt->{l})"
; }
my
$options
= {
cond
=>
$opt
->{c},
base
=>
$opt
->{b},
nb_digits
=>
$opt
->{n},
ext
=>
$opt
->{e},
plugin
=>
$opt
->{p},
no_pi
=>
$opt
->{d},
verbose
=>
$opt
->{v},
xinclude
=>
$opt
->{i} ? 1 : 0,
};
my
$state
;
if
(
my
$plugin
=
$opt
->{p})
{
if
(
$plugin
!~ m{^[\w:.-]+$}) {
die
"wrong plugin name '$plugin' (only word characters are allowed in plugin names)\n"
; }
push
@INC
,
$opt
->{I};
eval
{
require
$plugin
};
if
( $@) {
die
"cannot find plugin '$plugin': $!"
; }
import
$plugin
;
$state
=
$plugin
->new(
$options
);
}
if
(
$opt
->{s})
{
$state
||= xml_split::state::parser->new(
$options
);
$state
->{level} =
$opt
->{l};
$state
->{size} =
$opt
->{s};
$state
->{current_size}=0;
$state
->{handlers}= {
Start
=> \
&parser_start_tag_size
,
End
=> \
&parser_end_tag_size
,
Default
=> \
&parser_default_size
};
warn
"using XML::Parser\n"
if
(
$opt
->{v});
split_with_parser(
$state
,
@ARGV
);
}
elsif
(
$opt
->{g})
{
$state
||= xml_split::state::parser->new(
$options
);
$state
->{level}=
$opt
->{l};
$state
->{group}=
$opt
->{g};
$state
->{handlers}= {
Start
=> \
&parser_start_tag_grouped
,
End
=> \
&parser_end_tag_grouped
,
Default
=> \
&parser_default_grouped
};
warn
"using XML::Parser\n"
if
(
$opt
->{v});
split_with_parser(
$state
,
@ARGV
);
}
elsif
(
$opt
->{l})
{
$state
||= xml_split::state::parser->new(
$options
);
$state
->{level}=
$opt
->{l};
$state
->{handlers}= {
Start
=> \
&parser_start_tag_level
,
End
=> \
&parser_end_tag_level
,
Default
=> \
&parser_default_level
};
warn
"using XML::Parser\n"
if
(
$opt
->{v});
split_with_parser(
$state
,
@ARGV
);
}
else
{
$state
||= xml_split::state::twig->new(
$options
);
split_with_twig(
$state
,
@ARGV
);
}
exit
;
}
sub
split_with_twig
{
my
(
$state
,
@files
)=
@_
;
if
( !
@files
)
{
$state
->{base} ||=
'out'
;
$state
->{ext} ||=
'.xml'
;
my
$twig_options
= twig_options(
$state
);
my
$t
= XML::Twig->new(
%$twig_options
,
$state
);
$state
->{twig}=
$t
;
$t
->parse( \
*STDIN
);
end_file(
$t
,
$state
);
}
else
{
foreach
my
$file
(
@files
)
{
unless
(
$state
->{base}) {
$state
->{seq_nb}=0; }
my
(
$base
,
$ext
)= (
$file
=~ m{^(.*?)(\.\w+)?$});
$state
->{base} ||=
$base
;
$state
->{ext} ||=
$ext
||
'.xml'
;
my
$twig_options
= twig_options(
$state
);
my
$t
= XML::Twig->new(
%$twig_options
);
$state
->{twig}=
$t
;
$t
->parsefile(
$file
);
end_file(
$t
,
$state
);
}
}
}
sub
split_with_parser
{
my
(
$state
,
@files
)=
@_
;
if
( !
@files
)
{
$state
->{base} ||=
'out'
;
$state
->{ext} ||=
'.xml'
;
my
$parser_options
= parser_options(
$state
);
my
$p
= XML::Parser->new(
%$parser_options
);
$state
->{parser}=
$p
;
$p
->parse( \
*STDIN
);
}
else
{
foreach
my
$file
(
@files
)
{
unless
(
$state
->{base}) {
$state
->{seq_nb}=0; }
my
(
$base
,
$ext
)= (
$file
=~ m{^(.*?)(\.\w+)?$});
$state
->{base} ||=
$base
;
$state
->{ext} ||=
$ext
||
'.xml'
;
my
$parser_options
= parser_options(
$state
);
my
$p
= XML::Parser->new(
%$parser_options
);
$state
->{parser}=
$p
;
$p
->parsefile(
$file
);
}
}
}
sub
parser_options
{
my
(
$state
)=
@_
;
unless
(
$state
->{no_pi})
{
my
$file_name
=
$state
->main_file_name();
warn
"generating main file $file_name\n"
if
(
$state
->{verbose});
open
(
my
$out
,
'>'
,
$file_name
) or
die
"cannot create main file '$file_name': $!"
;
$state
->{main_fh}=
$out
;
$state
->{current_fh}=
$out
;
}
my
$handlers
= {
Start
=>
sub
{
$state
->{handlers}->{Start}->(
$state
,
shift
(
@_
)); },
End
=>
sub
{
$state
->{handlers}->{End}->(
$state
,
shift
(
@_
)); },
Default
=>
sub
{
$state
->{handlers}->{Default}->(
$state
,
shift
(
@_
)); },
XMLDecl
=>
sub
{ parser_declaration(
$state
,
@_
); },
};
return
{
Handlers
=>
$handlers
};
}
sub
parser_start_tag_level
{
my
(
$state
,
$p
)=
@_
;
if
(
$p
->depth ==
$state
->{level})
{
$state
->{seq_nb}++;
my
$file_name
=
$state
->file_name;
warn
"generating $file_name\n"
if
(
$state
->{verbose});
open
(
my
$out
,
'>'
,
$file_name
) or
die
"cannot create output file '$file_name': $!"
;
$state
->{current_fh}=
$out
;
if
(
$state
->{xml_declaration}) {
print
{
$state
->{current_fh}}
$state
->{xml_declaration},
"\n"
; }
unless
(
$state
->{no_pi})
{
print
{
$state
->{main_fh}}
$state
->include(
$file_name
) ; }
}
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
}
sub
parser_end_tag_level
{
my
(
$state
,
$p
)=
@_
;
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
if
(
$p
->depth ==
$state
->{level})
{
unless
(
$state
->{current_fh} ==
$state
->{main_fh})
{
close
$state
->{current_fh};
$state
->{current_fh}=
$state
->{main_fh};
}
}
}
sub
parser_default_level
{
my
(
$state
,
$p
)=
@_
;
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
}
sub
parser_start_tag_size
{
my
(
$state
,
$p
)=
@_
;
if
(
$p
->depth ==
$state
->{level} && !
$state
->{current_size})
{
$state
->{seq_nb}++;
my
$file_name
=
$state
->file_name;
warn
"generating $file_name\n"
if
(
$state
->{verbose});
open
(
my
$out
,
'>'
,
$file_name
) or
die
"cannot create output file '$file_name': $!"
;
$state
->{current_fh}=
$out
;
print
{
$state
->{current_fh}}
qq{$state->{xml_declaration}
\n}
if
$state
->{xml_declaration};
unless
(
$state
->{no_pi})
{
print
{
$state
->{main_fh}}
$state
->include(
$file_name
) ; }
$state
->{store_size}=1;
}
my
$original_string
=
$p
->original_string;
$state
->{current_size} +=
length
(
$original_string
)
if
(
$state
->{store_size});
print
{
$state
->{current_fh}}
$original_string
if
(
$state
->{current_fh});
}
sub
parser_end_tag_size
{
my
(
$state
,
$p
)=
@_
;
my
$original_string
=
$p
->original_string;
$state
->{current_size} +=
length
(
$original_string
)
if
(
$state
->{store_size});
if
(
$p
->depth ==
$state
->{level} &&
$state
->{current_size} >
$state
->{size})
{
print
{
$state
->{current_fh}}
$original_string
if
(
$state
->{current_fh});
end_file_with_size(
$state
);
}
else
{
if
(
$p
->depth <
$state
->{level}) { end_file_with_size(
$state
); }
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
}
}
sub
end_file_with_size
{
my
(
$state
)=
@_
;
unless
(
$state
->{current_fh} ==
$state
->{main_fh})
{
print
{
$state
->{current_fh}}
qq{\n</xml_split:root>\n}
;
close
$state
->{current_fh};
$state
->{current_size}=0;
$state
->{store_size}=0;
$state
->{current_fh}=
$state
->{main_fh};
}
}
sub
parser_default_size
{
my
(
$state
,
$p
)=
@_
;
my
$string
=
$p
->original_string;
if
(
$state
->{store_size})
{
$state
->{current_size} +=
length
(
$string
);
if
(
$p
->depth <
$state
->{level}) { end_file_with_size(
$state
); }
}
print
{
$state
->{current_fh}}
$string
if
(
$state
->{current_fh});
}
sub
parser_start_tag_grouped
{
my
(
$state
,
$p
)=
@_
;
if
(
$p
->depth ==
$state
->{level})
{
if
( !
$state
->{current_nb})
{
$state
->{seq_nb}++;
my
$file_name
=
$state
->file_name;
warn
"generating $file_name\n"
if
(
$state
->{verbose});
open
(
my
$out
,
'>'
,
$file_name
) or
die
"cannot create output file '$file_name': $!"
;
$state
->{current_fh}=
$out
;
print
{
$state
->{current_fh}}
join
(
"\n"
,
grep
{
$_
} (
$state
->{xml_declaration},
)
);
unless
(
$state
->{no_pi})
{
print
{
$state
->{main_fh}}
$state
->include(
$file_name
) ; }
}
}
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
}
sub
parser_end_tag_grouped
{
my
(
$state
,
$p
)=
@_
;
if
(
$p
->depth ==
$state
->{level})
{
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
$state
->{current_nb}++;
if
(
$state
->{current_nb} ==
$state
->{group}) { end_file_grouped(
$state
); }
}
else
{
if
(
$p
->depth <
$state
->{level}) { end_file_grouped(
$state
, {
no_nl
=> 1 }); }
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
}
}
sub
end_file_grouped
{
my
(
$state
,
$options
)=
@_
;
print
{
$state
->{current_fh}}
qq{\n}
unless
(
$options
->{no_nl});
unless
(
$state
->{current_fh} ==
$state
->{main_fh})
{
print
{
$state
->{current_fh}}
qq{</xml_split:root>\n}
;
close
$state
->{current_fh};
$state
->{current_nb}=0;
$state
->{current_fh}=
$state
->{main_fh};
}
}
sub
parser_default_grouped
{
my
(
$state
,
$p
)=
@_
;
print
{
$state
->{current_fh}}
$p
->original_string
if
(
$state
->{current_fh});
}
sub
char_parser
{
my
(
$state
,
$p
)=(
shift
,
shift
);
print
{
$state
->{current_fh}}
$_
[0]
if
(
$state
->{current_fh});
}
sub
parser_declaration
{
my
(
$state
,
$p
,
$version
,
$encoding
,
$standalone
)=
@_
;
$state
->{xml_declaration}=
$p
->recognized_string ||
''
;
print
{
$state
->{main_fh}}
$state
->{xml_declaration};
}
sub
twig_options
{
my
(
$state
)=
@_
;
my
$twig_options
= {
keep_encoding
=> 1,
keep_spaces
=> 1 };
unless
(
$state
->{no_pi})
{
my
$file_name
=
$state
->main_file_name();
warn
"generating main file $file_name\n"
if
(
$state
->{verbose});
open
(
my
$out
,
'>'
,
$file_name
) or
die
"cannot create main file '$file_name': $!"
;
$state
->{out}=
$out
;
$twig_options
->{twig_print_outside_roots}=
$out
;
$twig_options
->{start_tag_handlers}= {
$state
->{cond} =>
sub
{
$_
->set_att(
'#in_fragment'
=> 1); } };
}
$twig_options
->{twig_roots}= {
$state
->{cond} =>
sub
{ dump_elt(
@_
,
$state
); } };
return
$twig_options
;
}
sub
dump_elt
{
my
(
$t
,
$elt
,
$state
)=
@_
;
$state
->{seq_nb}++;
$state
->{elt}=
$elt
;
my
$file_name
=
$state
->file_name;
warn
"generating $file_name\n"
if
(
$state
->{verbose});
my
$fragment
= XML::Twig->new();
$fragment
->{twig_xmldecl} =
$t
->{twig_xmldecl};
$fragment
->{twig_doctype} =
$t
->{twig_doctype};
$fragment
->{twig_dtd} =
$t
->{twig_dtd};
if
( !
$state
->{no_pis})
{
my
$include
=
$state
->include(
$file_name
);
$elt
->del_att(
'#in_fragment'
);
if
(
$elt
->inherited_att(
'#in_fragment'
))
{
$elt
->parent(
'*[@#in_fragment="1"]'
)->set_att(
'#has_subdocs'
=> 1);
$include
->replace(
$elt
);
}
else
{
$elt
->cut;
$include
->
print
(
$state
->{out});
}
}
else
{
$elt
->cut; }
$fragment
->set_root(
$elt
);
open
(
my
$out
,
'>'
,
$file_name
) or
die
"cannot create output file '$file_name': $!"
;
$fragment
->set_keep_encoding( 1);
$fragment
->
print
(
$out
);
close
$out
;
}
sub
end_file
{
my
(
$t
,
$state
)=
@_
;
unless
(
$state
->{no_pi})
{
close
$state
->{out}; }
}
sub
HELP_MESSAGE {
return
$USAGE
; }
sub
VERSION_MESSAGE {
return
$VERSION
; }
sub
new
{
my
(
$ref
,
$options
)=
@_
;
my
$state
=
bless
$options
,
$ref
;
$state
->{seq_nb}=0;
return
$state
;
}
sub
file_name
{
my
(
$state
)=
@_
;
my
$nb
=
sprintf
(
"%0$state->{nb_digits}d"
,
$state
->{seq_nb});
my
$file_name
=
"$state->{base}-$nb$state->{ext}"
;
$file_name
=~ s{\\}{/}g;
return
$file_name
;
}
sub
main_file_name
{
my
(
$state
)=
@_
;
my
$nb
=
sprintf
(
"%0$state->{nb_digits}d"
, 0);
my
$file_name
=
"$state->{base}-$nb$state->{ext}"
;
return
$file_name
;
}
1;
import
xml_split::state;
sub
include
{
my
(
$state
,
$file_name
)=
@_
;
if
(
$state
->{xinclude})
{
return
qq{<xi:include href="$file_name" />}
; }
else
{
return
qq{<?merge subdocs = 0 :$file_name?>}
; }
}
1;
import
xml_split::state;
sub
include
{
my
(
$state
,
$file_name
)=
@_
;
my
$include
;
my
$subdocs
=
$state
->{elt}->att(
'#has_subdocs'
) || 0;
if
(
$state
->{xinclude})
{
$include
= XML::Twig::Elt->new(
'xi:include'
, {
href
=>
$file_name
});
if
(
$subdocs
) {
$include
->set_att(
subdocs
=> 1); }
}
else
{
$include
= XML::Twig::Elt->new(
'#PI'
)
->set_pi(
merge
=>
" subdocs = $subdocs :$file_name"
);
}
return
$include
;
}
1;