lib/Treex/Block/Read/BaseTextReader.pm


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
—
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
              package Treex::Block::Read::BaseTextReader;
$Treex::Block::Read::BaseTextReader::VERSION = '2.20210102';
use Moose;
use Treex::Core::Common;
extends 'Treex::Block::Read::BaseReader';
#use File::Slurp 9999;
use PerlIO::via::gzip;
# Attribute declarations.
#
# 'from' is redeclared with the Moose '+' prefix, i.e. it modifies an
# attribute inherited from the parent class rather than creating a new one.
# By default read from STDIN ('-').
# The listed methods are delegated to the object stored in 'from', so they
# can be called directly on the reader.
has '+from' => (
    default => '-',
    handles => [qw(current_filename current_filehandle file_number _set_file_number next_filehandle)],
);
# Maximum number of lines per document; 0 (the default) means files are not
# split (BUILD only disables one-doc-per-file mode for a true value).
has lines_per_doc => ( isa => 'Int',                   is => 'ro', default  => 0 );
# Documented as merging all input files into one stream.
# NOTE(review): not read anywhere in this file -- presumably consumed by the
# parent class or by the 'from' object; confirm.
has merge_files   => ( isa => 'Bool',                  is => 'ro', default  => 0 );
# Input encoding; any value other than 'utf8' is forwarded to
# from->set_encoding() in BUILD.
has encoding      => ( isa => 'Str',                   is => 'ro', default  => 'utf8' );
# Moose construction hook: brings the reader's state in line with its
# attributes.
#   * A true lines_per_doc means one file may produce several documents, so
#     the one-document-per-file flag is cleared.
#   * An encoding other than 'utf8' is handed over to the 'from' object.
# Returns nothing.
sub BUILD {
    my $self = shift;

    $self->set_is_one_doc_per_file(0) if $self->lines_per_doc;

    my $encoding = $self->encoding;
    $self->from->set_encoding($encoding) unless $encoding eq 'utf8';

    return;
}
# Returns the text of the next document.
#   * In one-document-per-file mode the call is delegated to
#     from->next_file_text().
#   * Otherwise up to lines_per_doc lines are fetched with from->next_line()
#     and concatenated; fetching stops early when next_line() yields undef.
# A bare return (undef / empty list) is issued when no line could be read
# and from->has_next_file() is false. If no line could be read but another
# file is still pending, an empty string is returned.
sub next_document_text {
    my $self = shift;

    return $self->from->next_file_text() if $self->is_one_doc_per_file;

    my $remaining = $self->lines_per_doc;
    my $buffer    = '';

    while ( $remaining-- > 0 ) {
        my $line = $self->from->next_line();
        if ( defined $line ) {
            $buffer .= $line;
            next;
        }

        # No more lines available right now. has_next_file() is consulted
        # only when nothing has been collected for this document.
        my $input_exhausted = $buffer eq '' && !$self->from->has_next_file();
        return if $input_exhausted;
        last;
    }

    return $buffer;
}
1;
__END__
=pod
=encoding utf-8
=for Pod::Coverage BUILD
=head1 NAME
Treex::Block::Read::BaseTextReader - abstract ancestor for document readers
=head1 VERSION
version 2.20210102
=head1 DESCRIPTION
This class serves as a common ancestor for document readers
that have a parameter C<from> with a space- or comma-separated list of filenames
to be loaded and that load the documents from plain text files.
It is designed to implement the L<Treex::Core::DocumentReader> interface.
In derived classes you need to define the C<next_document> method,
and you can use C<next_document_text> and C<new_document> methods.
=head1 ATTRIBUTES
=over
=item language (required)
=item lines_per_doc
The maximum number of lines per document. Set it if you want to split one file into several documents.
The default is 0, which means don't split.
=item merge_files
Merge the content of all files (specified in C<from> attribute) into one stream.
Useful in combination with C<lines_per_doc> to get equally-sized documents
even from non-equally-sized files.
=item encoding
The encoding of the input files, e.g. C<utf8> (the default) or C<cp1250>.
=back
=head1 METHODS
=over
=item next_document_text
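Returns the text of the next document as a string: the content of the next file
(specified in the C<from> attribute), or at most the next C<lines_per_doc> lines
if that attribute is set.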
=item next_filehandle
Helper method - you can use this instead of C<next_document_text>
if you don't want to load the whole text into memory
(but do e.g. SAX-like parsing).
=back
=head1 SEE
L<Treex::Block::Read::BaseReader>
L<Treex::Block::Read::Text>
=head1 AUTHOR
Martin Popel
=head1 COPYRIGHT AND LICENSE
Copyright © 2011-2012 by Institute of Formal and Applied Linguistics, Charles University in Prague
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)