Perl x Open Food Facts Hackathon: Paris, France - May 24-25 Learn more

$Treex::Block::Read::BaseTextReader::VERSION = '2.20210102';
use Moose;
#use File::Slurp 9999;

# By default read from STDIN.
# The leading '+' adjusts a "from" attribute that is declared elsewhere
# (its original declaration is not part of this section of the file).
# The listed methods are delegated to the "from" object, so they can be
# called directly on the reader.
has '+from' => (
    default => '-',
    handles => [
        qw(
            current_filename
            current_filehandle
            file_number
            _set_file_number
            next_filehandle
        )
    ],
);

# Number of lines that make up one document; 0 (the default) means
# that files are not split.
has 'lines_per_doc' => (
    is      => 'ro',
    isa     => 'Int',
    default => 0,
);

# Documented as: merge the content of all input files into one stream.
# NOTE(review): nothing in this section of the file reads this flag --
# presumably it is consumed elsewhere; confirm.
has 'merge_files' => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# Encoding of the input files; BUILD forwards any value other than
# 'utf8' to the "from" object.
has 'encoding' => (
    is      => 'ro',
    isa     => 'Str',
    default => 'utf8',
);
# Moose construction hook: brings dependent settings in line with the
# constructor arguments.
#
#   * A true lines_per_doc switches off the one-document-per-file mode.
#   * Any encoding other than the string 'utf8' is passed on to the
#     "from" object via set_encoding().
#
# Returns nothing (bare return).
sub BUILD {
    my ($self) = @_;

    # Splitting by line count and "one file == one document" exclude
    # each other, so the former turns the latter off.
    $self->set_is_one_doc_per_file(0) if $self->lines_per_doc;

    # Only a non-default encoding is forwarded to the input object.
    $self->from->set_encoding( $self->encoding )
        if $self->encoding ne 'utf8';

    return;
}
# Returns the text of the next document.
#
# When is_one_doc_per_file is true, the result is whatever
# from->next_file_text() returns.
#
# Otherwise up to lines_per_doc lines are fetched with from->next_line()
# and concatenated. Collection stops early as soon as next_line()
# returns undef. If that happens before any text was collected and
# from->has_next_file() is false, nothing is returned (bare return:
# undef in scalar context, empty list in list context).
#
# NOTE(review): if next_line() returns undef before any text was
# collected while has_next_file() is still true, an empty string is
# returned -- confirm that callers expect an empty document here.
sub next_document_text {
    my ($self) = @_;

    return $self->from->next_file_text() if $self->is_one_doc_per_file;

    my $doc_text = q{};
    for my $line_no ( 1 .. $self->lines_per_doc ) {
        my $line = $self->from->next_line();

        if ( defined $line ) {
            $doc_text .= $line;
            next;
        }

        # No line available: either the input is completely exhausted
        # (signal it by returning nothing) or this document ends here.
        return if length($doc_text) == 0 && !$self->from->has_next_file();
        last;
    }

    return $doc_text;
}
1;
__END__
=pod
=encoding utf-8
=for Pod::Coverage BUILD
=head1 NAME
Treex::Block::Read::BaseTextReader - abstract ancestor for document readers
=head1 VERSION
version 2.20210102
=head1 DESCRIPTION
This class serves as a common ancestor for document readers
that have a C<from> parameter with a space- or comma-separated list of filenames
to be loaded, and that load the documents from plain text files.
It is designed to implement the L<Treex::Core::DocumentReader> interface.
In derived classes you need to define the C<next_document> method,
and you can use C<next_document_text> and C<new_document> methods.
=head1 ATTRIBUTES
=over
=item language (required)
=item lines_per_doc
The number of lines per document, used to split one file into several documents.
The default is 0, which means the files are not split.
=item merge_files
Merge the content of all files (specified in C<from> attribute) into one stream.
Useful in combination with C<lines_per_doc> to get equally-sized documents
even from non-equally-sized files.
=item encoding
The encoding of the input files, e.g. C<utf8> (the default), C<cp1250> etc.
=back
=head1 METHODS
=over
=item next_document_text
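Returns the text of the next document as a string: the content of the next file (specified in the C<from> attribute), or at most C<lines_per_doc> lines when C<lines_per_doc> is set. Returns nothing when no text could be read and there is no further file.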
=item next_filehandle
Helper method - you can use this instead of C<next_document_text>
if you don't want to load the whole text into memory
(but do e.g. SAX-like parsing).
=back
=head1 SEE
L<Treex::Block::Read::BaseReader>
L<Treex::Block::Read::Text>
=head1 AUTHOR
Martin Popel
=head1 COPYRIGHT AND LICENSE
Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.