#!./perl
use
5.008001;
BEGIN {
pop
@INC
if
$INC
[-1] eq
'.'
}
use
strict;
use
warnings;
use
Encode;
use
Getopt::Std;
use
Carp;
use
Encode::Guess;
# Opt in to Getopt::Std's standard-conforming handling of --help/--version.
$Getopt::Std::STANDARD_HELP_VERSION = 1;

# Command-line switches:
#   -h      show usage and exit
#   -u      suppress output for files whose encoding cannot be identified
#   -S      list every encoding name accepted by -s, then exit
#   -s LIST extra suspect encodings, separated by ':' or ','
my %opt;
getopts( "huSs:", \%opt );

# Suspect encodings handed to guess_encoding() by do_guess().
my @suspect_list;

list_valid_suspects() and exit if $opt{S};

# A character class is required here: the separators are alternatives
# (':' OR ','), not the literal two-character sequence ':,'.
@suspect_list = split /[:,]/, $opt{s} if $opt{s};

HELP_MESSAGE() if $opt{h};
HELP_MESSAGE() unless @ARGV;

# Every remaining argument is a file whose encoding should be guessed.
do_guess($_) for @ARGV;
# read_file($filename)
#
# Slurp the whole of $filename as raw bytes (no I/O layers applied) and
# return the content as a single scalar.  Croaks with "$filename:$!" when
# the file cannot be opened.
sub read_file {
    my ($filename) = @_;

    open( my $fh, '<:raw', $filename ) or croak "$filename:$!";

    # Undefine the input record separator only for the duration of the
    # read so that one <$fh> returns the entire file.
    my $content = do { local $/; <$fh> };

    close $fh;
    return $content;
}
# do_guess($filename)
#
# Read $filename, ask Encode::Guess which encoding it is in (trying the
# encodings in @suspect_list as well), and print one line to STDOUT:
#
#     <filename> TAB <MIME name of the encoding | "unknown">
#
# guess_encoding() yields a reference (an encoding object) when it settles
# on an encoding; any non-reference result is treated as "not identified".
# With -u, unidentified files produce no output at all.
#
# Always returns 1.  Croaks (via read_file) if the file cannot be opened.
sub do_guess {
    my $filename = shift;
    my $data     = read_file($filename);
    my $enc      = guess_encoding( $data, @suspect_list );

    # -u: stay silent about files we could not identify.
    if ( !ref($enc) && $opt{u} ) {
        return 1;
    }

    # Emit the result; without these print calls the tool produces no
    # output whatsoever.
    print "$filename\t";
    if ( ref($enc) ) {
        print $enc->mime_name();
    }
    else {
        print "unknown";
    }
    print "\n";

    return 1;
}
# list_valid_suspects()
#
# Print every encoding name reported by Encode->encodings(":all"), one per
# line, to the currently selected output handle.  These are the names that
# may be passed to the -s switch.  Always returns 1 so the caller can chain
# it with "and exit".
sub list_valid_suspects {
    # The list must actually be printed; a bare join() in void context
    # computes the text and throws it away.
    print join( "\n", Encode->encodings(":all") );
    print "\n";
    return 1;
}
# HELP_MESSAGE()
#
# Show usage by replacing this process with "pod2usage <this script>",
# which renders the POD found after __END__.  Any arguments passed in are
# ignored.  Does not return on success; dies with "pod2usage: $!" if the
# exec itself fails (e.g. pod2usage is not on PATH).
sub HELP_MESSAGE {
    my @usage_cmd = ( 'pod2usage', $0 );
    exec(@usage_cmd) or die "pod2usage: $!";
}
__END__
=head1 NAME
encguess - guess character encodings of files
=head1 VERSION
$Id: encguess,v 0.4 2023/11/10 01:10:50 dankogai Exp $
=head1 SYNOPSIS
encguess [switches] filename...
=head2 SWITCHES
=over 2
=item -h
show this message and exit.
=item -s
specify a list of "suspect encoding types" to test,
separated by either C<:> or C<,>
=item -S
output a list of all acceptable encoding types that can be used with
the -s param
=item -u
suppress display of unidentified types
=back
=head2 EXAMPLES:
=over 2
=item *
Guess encoding of a file named C<test.txt>, using only the default
suspect types.
encguess test.txt
=item *
Guess the encoding type of a file named C<test.txt>, using the suspect
types C<euc-jp,shiftjis,7bit-jis>.
encguess -s euc-jp,shiftjis,7bit-jis test.txt
encguess -s euc-jp:shiftjis:7bit-jis test.txt
=item *
Guess the encoding type of several files, do not display results for
unidentified files.
encguess -us euc-jp,shiftjis,7bit-jis test*.txt
=back
=head1 DESCRIPTION
The encoding identification is done by checking one encoding type at a
time until all but the right type are eliminated. The set of encoding
types to try is defined by the -s parameter and defaults to ascii,
utf8 and UTF-16/32 with BOM. This can be overridden by passing one or
more encoding types via the -s parameter. If you need to pass in
multiple suspect encoding types, separate the values with C<:> or C<,>
as shown in the examples above.
=head1 SEE ALSO
L<Encode::Guess>, L<Encode::Detect>
=head1 LICENSE AND COPYRIGHT
Copyright 2015 Michael LaGrasta and Dan Kogai.
This program is free software; you can redistribute it and/or modify it
under the terms of the Artistic License (2.0). You may obtain a
copy of the full license at:

L<http://www.perlfoundation.org/artistic_license_2_0>

=cut