my
%CharsetMaps
= (
'iso-8859-1'
=>
'MHonArc/UTF8/ISO8859_1.pm'
,
'iso-8859-2'
=>
'MHonArc/UTF8/ISO8859_2.pm'
,
'iso-8859-3'
=>
'MHonArc/UTF8/ISO8859_3.pm'
,
'iso-8859-4'
=>
'MHonArc/UTF8/ISO8859_4.pm'
,
'iso-8859-5'
=>
'MHonArc/UTF8/ISO8859_5.pm'
,
'iso-8859-6'
=>
'MHonArc/UTF8/ISO8859_6.pm'
,
'iso-8859-7'
=>
'MHonArc/UTF8/ISO8859_7.pm'
,
'iso-8859-8'
=>
'MHonArc/UTF8/ISO8859_8.pm'
,
'iso-8859-9'
=>
'MHonArc/UTF8/ISO8859_9.pm'
,
'iso-8859-10'
=>
'MHonArc/UTF8/ISO8859_10.pm'
,
'iso-8859-11'
=>
'MHonArc/UTF8/ISO8859_11.pm'
,
'iso-8859-13'
=>
'MHonArc/UTF8/ISO8859_13.pm'
,
'iso-8859-14'
=>
'MHonArc/UTF8/ISO8859_14.pm'
,
'iso-8859-15'
=>
'MHonArc/UTF8/ISO8859_15.pm'
,
'iso-8859-16'
=>
'MHonArc/UTF8/ISO8859_16.pm'
,
'cp866'
=>
'MHonArc/UTF8/CP866.pm'
,
'cp949'
=>
'MHonArc/UTF8/CP949.pm'
,
'cp932'
=>
'MHonArc/UTF8/CP932.pm'
,
'cp936'
=>
'MHonArc/UTF8/CP936.pm'
,
'cp950'
=>
'MHonArc/UTF8/CP950.pm'
,
'cp1250'
=>
'MHonArc/UTF8/CP1250.pm'
,
'cp1251'
=>
'MHonArc/UTF8/CP1251.pm'
,
'cp1252'
=>
'MHonArc/UTF8/CP1252.pm'
,
'cp1253'
=>
'MHonArc/UTF8/CP1253.pm'
,
'cp1254'
=>
'MHonArc/UTF8/CP1254.pm'
,
'cp1255'
=>
'MHonArc/UTF8/CP1255.pm'
,
'cp1256'
=>
'MHonArc/UTF8/CP1256.pm'
,
'cp1257'
=>
'MHonArc/UTF8/CP1257.pm'
,
'cp1258'
=>
'MHonArc/UTF8/CP1258.pm'
,
'koi-0'
=>
'MHonArc/UTF8/KOI_0.pm'
,
'koi-7'
=>
'MHonArc/UTF8/KOI_7.pm'
,
'koi8-a'
=>
'MHonArc/UTF8/KOI8_A.pm'
,
'koi8-b'
=>
'MHonArc/UTF8/KOI8_B.pm'
,
'koi8-e'
=>
'MHonArc/UTF8/KOI8_E.pm'
,
'koi8-f'
=>
'MHonArc/UTF8/KOI8_F.pm'
,
'koi8-r'
=>
'MHonArc/UTF8/KOI8_R.pm'
,
'koi8-u'
=>
'MHonArc/UTF8/KOI8_U.pm'
,
'gost19768-87'
=>
'MHonArc/UTF8/GOST19768_87.pm'
,
'viscii'
=>
'MHonArc/UTF8/VISCII.pm'
,
'macarabic'
=>
'MHonArc/UTF8/AppleArabic.pm'
,
'maccentraleurroman'
=>
'MHonArc/UTF8/AppleCenteuro.pm'
,
'maccroatian'
=>
'MHonArc/UTF8/AppleCroatian.pm'
,
'maccyrillic'
=>
'MHonArc/UTF8/AppleCyrillic.pm'
,
'macgreek'
=>
'MHonArc/UTF8/AppleGreek.pm'
,
'machebrew'
=>
'MHonArc/UTF8/AppleHebrew.pm'
,
'macicelandic'
=>
'MHonArc/UTF8/AppleIceland.pm'
,
'macromanian'
=>
'MHonArc/UTF8/AppleRomanian.pm'
,
'macroman'
=>
'MHonArc/UTF8/AppleRoman.pm'
,
'macthai'
=>
'MHonArc/UTF8/AppleThai.pm'
,
'macturkish'
=>
'MHonArc/UTF8/AppleTurkish.pm'
,
'big5-eten'
=>
'MHonArc/UTF8/BIG5_ETEN.pm'
,
'big5-hkscs'
=>
'MHonArc/UTF8/BIG5_HKSCS.pm'
,
'gb2312'
=>
'MHonArc/UTF8/GB2312.pm'
,
'euc-jp'
=>
'MHonArc/UTF8/EUC_JP.pm'
,
'hp-roman8'
=>
'MHonArc/UTF8/HP_ROMAN8.pm'
,
);
my
$char_maps
= MHonArc::CharMaps->new(\
%CharsetMaps
);
my
$utf8_re
=
q/[\x00-\x7F]|
[\xC0-\xDF][\x00-\xFF]|
[\xE0-\xEF][\x00-\xFF]{2}|
[\xF0-\xF7][\x00-\xFF]{3}|
[\xF8-\xFB][\x00-\xFF]{4}|
[\xFC\xFD][\x00-\xFF]{5}|
[\x80-\xFF]/
;
sub
utf8_length {
my
$n
= 0;
while
(
$_
[0] =~ m/(
$utf8_re
)/gox) { ++
$n
; };
$n
;
}
sub
clip {
my
$str
=
shift
;
my
$len
=
shift
;
my
$is_html
=
shift
;
my
$has_tags
=
shift
;
if
(!
$is_html
) {
return
$str
if
length
(
$str
) <=
$len
;
$str
=~ m/^((?:
$utf8_re
){1,
$len
})/x;
return
$1;
}
$str
=~ s/<[^>]*>//g
if
$has_tags
;
return
$str
if
length
(
$str
) <=
$len
;
my
(
$utf8_len
,
$er_len
);
my
$text
=
""
;
my
$subtext
=
""
;
my
$sub_len
=
$len
;
my
$real_len
= 0;
while
(
$str
ne
''
) {
if
(!(
$str
=~ s/^((?:
$utf8_re
){1,
$sub_len
})//x)) {
warn
qq/Warning: MHonArc::UTF8::MhaEncode::clip:/
,
qq/ Internal error/
;
return
$text
.
$str
;
}
$subtext
= $1;
if
((
$str
ne
''
) && (
$subtext
=~ /\&[^;]*\Z/)) {
if
(
$str
=~ s/^([^;]*;)//) {
$subtext
.= $1;
}
else
{
warn
qq/Warning: MHonArc::UTF8::MhaEncode::clip: malformed/
,
qq/ entity reference detected\n/
;
$subtext
.=
$str
;
$str
=
''
;
}
}
$er_len
= 0;
while
(
$subtext
=~ /(\&[^;]+);/g) {
$er_len
+=
length
($1);
}
$utf8_len
= utf8_length(
$subtext
);
$real_len
+=
$utf8_len
-
$er_len
;
$text
.=
$subtext
;
last
if
(
$real_len
>=
$len
);
$sub_len
=
$len
-
$real_len
;
}
$text
;
}
sub
to_utf8 {
my
$data
=
shift
;
my
$charset
=
lc
shift
;
my
$data_r
=
ref
(
$data
) ?
$data
: \
$data
;
return
$$data_r
if
(
$charset
eq
'us-ascii'
||
$charset
eq
'utf-8'
||
$charset
eq
'utf8'
);
MHonArc::Char::map_conv(
$data_r
,
$charset
,
$char_maps
);
}
sub
str2sgml {
my
$data
=
shift
;
my
$charset
=
lc
shift
;
my
$data_r
=
ref
(
$data
) ?
$data
: \
$data
;
if
(
$charset
eq
'us-ascii'
) {
if
(
$$data_r
=~ /[\x80-\xFF]/) {
$charset
=
'iso-8859-1'
;
}
else
{
$$data_r
=~ s/([
$HTMLSpecials
])/
$HTMLSpecials
{$1}/go;
return
$$data_r
;
}
}
if
(
$charset
eq
'utf-8'
||
$charset
eq
'utf8'
) {
$$data_r
=~ s/([
$HTMLSpecials
])/
$HTMLSpecials
{$1}/go;
return
$$data_r
;
}
MHonArc::Char::map_conv(
$data_r
,
$charset
,
$char_maps
);
$$data_r
=~ s/([
$HTMLSpecials
])/
$HTMLSpecials
{$1}/go;
$$data_r
;
}
1;