bin/xhtml-valid - metacpan.org


            
              1
2
3
4
5
6
7
8
—
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
—
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
              #!/usr/bin/perl
use strict;
use warnings;
$|++;
my $VERSION = '0.12';
#----------------------------------------------------------------------------
=head1 NAME
xhtml-valid - test web page DTD validation.
=head1 SYNOPSIS
  xhtml-valid \
         [-i|ignore file] \
         ( [-r|root url]  | [-u|url url]   | [--ulist file] | \
           [-p|path path] | [-f|file file] | [--flist file] ) \
         [-h|help] [-v|version]
=head1 DESCRIPTION
Using either URLs or flat files, this program attempts to validate web pages
according to their own DTD.
=cut
# -------------------------------------
# Library Modules
use Getopt::Long;
use Test::XHTML::Valid;
# -------------------------------------
# Variables
# Command line options, filled in by init_options().
my %options;
# Default patterns for links that are skipped rather than validated.
# Further patterns may be appended from the --ignore file.
my @IGNORE = (
    qr{^mailto},                    # e-mail links
    qr{\.(xml|txt|pdf)$}i,          # documents
    qr{\.(tar\.gz|zip)$}i,          # archives
    qr{\.(mp4|avi|wmv)$}i,          # video
    qr{\.(jpg|bmp|gif|png)$}i,      # images
);
# -------------------------------------
# Program
##### INITIALISE #####
# Parse the command line first; init_options() exits via help() on bad input.
init_options();
##### MAIN #####
# Create the validator and hand it the patterns for links to be skipped.
my $txv = Test::XHTML::Valid->new();
$txv->ignore_list(@IGNORE);
# Exactly one target option is acted upon; the first one found in the order
# below wins.
# dynamic pages
if($options{root}) {
    # Pass the --root value itself; $options{url} is only set by --url.
    $txv->process_root($options{root});
} elsif($options{url}) {
    $txv->process_link($options{url});
} elsif($options{ulist}) {
    $txv->process_url_list($options{ulist});
# static pages
} elsif($options{flist}) {
    $txv->process_file_list($options{flist});
} elsif($options{file}) {
    $txv->process_file($options{file});
} elsif($options{path}) {
    $txv->process_path($options{path});
# oops!
} else {
    # No target supplied: show the full help screen and exit.
    help(1);
}
$txv->process_retries();
my $results = $txv->process_results();
# Only print the error text when at least one page failed.
if($results->{FAIL}) {
    print $txv->errstr() . "\n";
    #my @errors = $txv->errors();
}
# Summary counters; a missing counter is reported as 0.
printf "%5s: %s\n", $_, ($results->{$_}||0)  for(qw(PAGES PASS FAIL NET));
# -------------------------------------
# Subroutines
# Parse and validate the command line, storing the results in %options.
#
# Takes no arguments and returns nothing useful. Calls help(), which exits
# the program, when option parsing fails, when --help or --version is given,
# or when a named path or file does not exist. Patterns read from the
# --ignore file (one regular expression per line) are appended to @IGNORE.
# Dies if the --ignore file cannot be opened.
sub init_options {
    GetOptions( \%options,
        'path|p=s',
        'file|f=s',
        'flist=s',
        'root|r=s',
        'url|u=s',
        'ulist=s',
        'ignore|i=s',
        'help|h',
        'version|v'
    ) or help(1);
    help(1)    if($options{help});
    help(0)    if($options{version});
    if(defined $options{path} && ! -d $options{path}) {
        print "ERROR: path not found - $options{path}\n";
        help(1);
    }
    for my $file ('file','flist','ulist') {
        if(defined $options{$file} && ! -f $options{$file}) {
            print "ERROR: file used in option '$file' not found [$options{$file}]\n";
            help(1);
        }
    }
    if(defined $options{ignore}) {
        # Always attempt to read the file when --ignore is given; open()
        # reports a missing or unreadable file through $!.
        open(my $fh, '<', $options{ignore})
            or die "Cannot read file [$options{ignore}]: $!\n";
        while(my $line = <$fh>) {
            chomp $line;
            # An empty pattern matches everything, so skip blank lines.
            next unless length $line;
            push @IGNORE, qr/$line/;
        }
        close $fh;
    }
}
# Print usage information and terminate the program.
#
# With a true argument the full usage screen is printed before the
# program name and version; with a false argument only the name/version
# line is printed. Always exits with status 0.
sub help {
    my ($verbose) = @_;
    # Name/version line, printed last in both modes.
    my $banner = "$0 v$VERSION\n\n";
    print <<HERE if $verbose;
Usage: $0 [-h] [-v] \\
         [-i file]
         ( [-r url]  | [-u url]  | [--ulist file] \\
           [-p path] | [-f file] | [--flist file] )
  -i file       patterns used to ignore URLs (e.g. user login)
  -r url        root target URL for validating (multiple pages)
  -u url        target URL for validating (single page)
  -ulist file   file containing a list of target URLs
  -p path       target directory of XHTML files
  -f file       single target XHTML file path
  -flist file   file containing a list of XHTML file paths
  -h            this help screen
  -v            program version
  Note: The --root|r option acts as a crawler. As such use with care. Testing
        any such URL will also test any links found in the root page, and any
        subsequent pages, that match a URL that would be below the given root
        URL. External links and non-child links of the given root are not 
        tested.
        The --ulist and --url options will only test the web links listed, and
        will NOT crawl any links within the page.
HERE
    print $banner;
    exit 0;
}
__END__
=head1 USAGE
This program can be used in several ways to validate web pages. Given a root
URL, it will crawl the website from that root and validate every page it finds
below it, or it can test named URLs only. Given a local root directory, it will
traverse the directory tree and validate every HTML file it finds, or it can
test a single file or a list of files. In short, it tries to validate web pages.
=head2 URL Options
=over
=item * -r|root url
Given a root URL will traverse the website, validating all pages found that
are below the root URL. Thus external links and those outside of the root URL
are ignored.
=item * -u|url url
Given a single URL, will validate that page only.
=item * --ulist file
The given file should contain a list of URLs (one per line), which will then be
validated. Note that only the links listed are validated, no crawling of the
links within the page is performed.
=back
=head2 File Options
=over
=item * -p|path path
Given a root directory will traverse the directory tree and validate every
.html or .htm file it finds.
=item * -f|file file
Validates a single file.
=item * --flist file
The given file should contain a list of files (one per line), which will then
be validated.
=back
=head2 Supporting Options
=over
=item * -i|ignore file
The given file should contain patterns (one per line) used to ignore URLs and
files (e.g. user login) from validation.
By default mailto links and various document and binary file formats are
ignored, together with any non-'http' protocol.
=back
=head2 Other Options
=over
=item * -h|help
Provides a help screen.
=item * -v|version
Provides the current program version.
=back
=head1 BUGS, PATCHES & FIXES
There are no known bugs at the time of this release. However, if you spot a
bug or are experiencing difficulties that are not explained within the POD
documentation, please send bug reports and patches to barbie@cpan.org.
Fixes are dependent upon their severity and my availability. Should a fix not
be forthcoming, please feel free to (politely) remind me.
=head1 SEE ALSO
L<XML::LibXML>
=head1 AUTHOR
  Barbie, <barbie@cpan.org>
  for Miss Barbell Productions <http://www.missbarbell.co.uk>.
=head1 COPYRIGHT AND LICENSE
  Copyright (C) 2008-2013 by Barbie <barbie@missbarbell.co.uk>
  This distribution is free software; you can redistribute it and/or
  modify it under the Artistic Licence v2.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)