local-lib/lib/perl5/x86_64-linux/HTML/HeadParser.pm


            
              1
—
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
—
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
—
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
—
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
              package HTML::HeadParser;
=head1 NAME
HTML::HeadParser - Parse <HEAD> section of a HTML document
=head1 SYNOPSIS
 require HTML::HeadParser;
 $p = HTML::HeadParser->new;
 $p->parse($text) and  print "not finished";
 $p->header('Title')          # to access <title>....</title>
 $p->header('Content-Base')   # to access <base href="http://...">
 $p->header('Foo')            # to access <meta http-equiv="Foo" content="...">
 $p->header('X-Meta-Author')  # to access <meta name="author" content="...">
 $p->header('X-Meta-Charset') # to access <meta charset="...">
=head1 DESCRIPTION
The C<HTML::HeadParser> is a specialized (and lightweight)
C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
section of an HTML document.  The parse() method
will return a FALSE value as soon as some E<lt>BODY> element or body
text are found, and should not be called again after this.
Note that the C<HTML::HeadParser> might get confused if raw undecoded
UTF-8 is passed to the parse() method.  Make sure the strings are
properly decoded before passing them on.
The C<HTML::HeadParser> keeps a reference to a header object, and the
parser will update this header object as the various elements of the
E<lt>HEAD> section of the HTML document are recognized.  The following
header fields are affected:
=over 4
=item Content-Base:
The I<Content-Base> header is initialized from the E<lt>base
href="..."> element.
=item Title:
The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
element.
=item Isindex:
The I<Isindex> header will be added if there is a E<lt>isindex>
element in the E<lt>head>.  The header value is initialized from the
I<prompt> attribute if it is present.  If no I<prompt> attribute is
given it will have '?' as the value.
=item X-Meta-Foo:
All E<lt>meta> elements containing a C<name> attribute will result in
headers using the prefix C<X-Meta-> appended with the value of the
C<name> attribute as the name of the header, and the value of the
C<content> attribute as the pushed header value.
E<lt>meta> elements containing a C<http-equiv> attribute will result
in headers as in above, but without the C<X-Meta-> prefix in the
header name.
E<lt>meta> elements containing a C<charset> attribute will result in
an C<X-Meta-Charset> header, using the value of the C<charset>
attribute as the pushed header value.
=back
=head1 METHODS
The following methods (in addition to those provided by the
superclass) are available:
=over 4
=cut
require HTML::Parser;
@ISA = qw(HTML::Parser);
use HTML::Entities ();
use strict;
use vars qw($VERSION $DEBUG);
#$DEBUG = 1;
$VERSION = "3.69";
=item $hp = HTML::HeadParser->new
=item $hp = HTML::HeadParser->new( $header )
The object constructor.  The optional $header argument should be a
reference to an object that implement the header() and push_header()
methods as defined by the C<HTTP::Headers> class.  Normally it will be
of some class that is a or delegates to the C<HTTP::Headers> class.
If no $header is given C<HTML::HeadParser> will create an
C<HTTP::Headers> object by itself (initially empty).
=cut
sub new
{
    my($class, $header) = @_;
    unless ($header) {
        require HTTP::Headers;
        $header = HTTP::Headers->new;
    }
    my $self = $class->SUPER::new(api_version => 3,
                                  start_h => ["start", "self,tagname,attr"],
                                  end_h   => ["end",   "self,tagname"],
                                  text_h  => ["text",  "self,text"],
                                  ignore_elements => [qw(script style)],
                                 );
    $self->{'header'} = $header;
    $self->{'tag'} = '';   # name of active element that takes textual content
    $self->{'text'} = '';  # the accumulated text associated with the element
    $self;
}
=item $hp->header;
Returns a reference to the header object.
=item $hp->header( $key )
Returns a header value.  It is just a shorter way to write
C<$hp-E<gt>header-E<gt>header($key)>.
=cut
sub header
{
    my $self = shift;
    return $self->{'header'} unless @_;
    $self->{'header'}->header(@_);
}
sub as_string    # legacy
{
    my $self = shift;
    $self->{'header'}->as_string;
}
sub flush_text   # internal
{
    my $self = shift;
    my $tag  = $self->{'tag'};
    my $text = $self->{'text'};
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
    print "FLUSH $tag => '$text'\n"  if $DEBUG;
    if ($tag eq 'title') {
        my $decoded;
        $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode;
        HTML::Entities::decode($text);
        utf8::encode($text) if $decoded;
        $self->{'header'}->push_header(Title => $text);
    }
    $self->{'tag'} = $self->{'text'} = '';
}
# This is an quote from the HTML3.2 DTD which shows which elements
# that might be present in a <HEAD>...</HEAD>.  Also note that the
# <HEAD> tags themselves might be missing:
#
# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
#                            SCRIPT* & META* & LINK*">
#
# <!ELEMENT HEAD O O  (%head.content)>
#
# From HTML 4.01:
#
# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT">
# <!ENTITY % head.content "TITLE & BASE?">
# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)>
#
# From HTML 5 as of WD-html5-20090825:
#
# One or more elements of metadata content, [...]
# => base, command, link, meta, noscript, script, style, title
sub start
{
    my($self, $tag, $attr) = @_;  # $attr is reference to a HASH
    print "START[$tag]\n" if $DEBUG;
    $self->flush_text if $self->{'tag'};
    if ($tag eq 'meta') {
        my $key = $attr->{'http-equiv'};
        if (!defined($key) || !length($key)) {
            if ($attr->{name}) {
                $key = "X-Meta-\u$attr->{name}";
            } elsif ($attr->{charset}) { # HTML 5 <meta charset="...">
                $key = "X-Meta-Charset";
                $self->{header}->push_header($key => $attr->{charset});
                return;
            } else {
                return;
            }
        }
        $self->{'header'}->push_header($key => $attr->{content});
    } elsif ($tag eq 'base') {
        return unless exists $attr->{href};
        (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5
        $self->{'header'}->push_header('Content-Base' => $base);
    } elsif ($tag eq 'isindex') {
        # This is a non-standard header.  Perhaps we should just ignore
        # this element
        $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
    } elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
        # Just remember tag.  Initialize header when we see the end tag.
        $self->{'tag'} = $tag;
    } elsif ($tag eq 'link') {
        return unless exists $attr->{href};
        # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
        my $href = delete($attr->{href});
        $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5
        my $h_val = "<$href>";
        for (sort keys %{$attr}) {
            next if $_ eq "/";  # XHTML junk
            $h_val .= qq(; $_="$attr->{$_}");
        }
        $self->{'header'}->push_header(Link => $h_val);
    } elsif ($tag eq 'head' || $tag eq 'html') {
        # ignore
    } else {
         # stop parsing
        $self->eof;
    }
}
sub end
{
    my($self, $tag) = @_;
    print "END[$tag]\n" if $DEBUG;
    $self->flush_text if $self->{'tag'};
    $self->eof if $tag eq 'head';
}
sub text
{
    my($self, $text) = @_;
    print "TEXT[$text]\n" if $DEBUG;
    unless ($self->{first_chunk}) {
        # drop Unicode BOM if found
        if ($self->utf8_mode) {
            $text =~ s/^\xEF\xBB\xBF//;
        }
        else {
            $text =~ s/^\x{FEFF}//;
        }
        $self->{first_chunk}++;
    }
    my $tag = $self->{tag};
    if (!$tag && $text =~ /\S/) {
        # Normal text means start of body
        $self->eof;
        return;
    }
    return if $tag ne 'title';
    $self->{'text'} .= $text;
}
BEGIN {
    *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;
}
1;
__END__
=back
=head1 EXAMPLE
 $h = HTTP::Headers->new;
 $p = HTML::HeadParser->new($h);
 $p->parse(<<EOT);
 <title>Stupid example</title>
 <base href="http://www.linpro.no/lwp/">
 Normal text starts here.
 EOT
 undef $p;
 print $h->title;   # should print "Stupid example"
=head1 SEE ALSO
L<HTML::Parser>, L<HTTP::Headers>
The C<HTTP::Headers> class is distributed as part of the
I<libwww-perl> package.  If you don't have that distribution installed
you need to provide the $header argument to the C<HTML::HeadParser>
constructor with your own object that implements the documented
protocol.
=head1 COPYRIGHT
Copyright 1996-2001 Gisle Aas. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)