##---------------------------------------------------------------------------##
## File:
## $Id: CharEnt.pm,v 1.3 2002/04/13 00:58:09 ehood Exp $
## Author:
## Earl Hood earl@earlhood.com
## Description:
## Module to deal with 8-bit character data conversion to
## (SGML) entity references.
##---------------------------------------------------------------------------##
## Copyright (C) 1997-2002 Earl Hood, earl@earlhood.com
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
## 02111-1307, USA
##---------------------------------------------------------------------------##
package
MHonArc::CharEnt;
use
strict;
##---------------------------------------------------------------------------
## US-ASCII/Common characters
##---------------------------------------------------------------------------
# Entities shared by every charset: the markup-significant characters plus
# the no-break space.  Keys are character codes; values are entity names
# without the surrounding "&" and ";".
my %ASCIIMap = (
    #----------------------------------------------------------------------
    # Code     Entity       ISO external entity set : description
    #----------------------------------------------------------------------
    0x22 => "quot",     # ISOnum : Quotation mark
    0x26 => "amp",      # ISOnum : Ampersand
    0x3C => "lt",       # ISOnum : Less-than sign
    0x3E => "gt",       # ISOnum : Greater-than sign
    0xA0 => "nbsp",     # ISOnum : NO-BREAK SPACE
);

# Inverse lookup: entity name => character code.
my %ASCIIMapReverse = map { ($ASCIIMap{$_} => $_) } keys %ASCIIMap;
##---------------------------------------------------------------------------
## Loaded Maps
##---------------------------------------------------------------------------
# Tables that are already in memory, keyed by charset name.  Only us-ascii
# is seeded here; the loader routines in this file add further entries as
# charsets are requested.

# character code => entity name
my %char2ent_maps;
$char2ent_maps{'us-ascii'} = \%ASCIIMap;

# entity name => character code
my %ent2char_maps;
$ent2char_maps{'us-ascii'} = \%ASCIIMapReverse;
##---------------------------------------------------------------------------
## Charset specification to mapping
##---------------------------------------------------------------------------
# Charset name => file that _load_charmap() require()s to obtain the
# character => entity table for that charset.  Every file lives under
# MHonArc/CharEnt/, so only the module base name is listed per row.
# The latinN names are aliases onto the ISO-8859 files (note latin5 is
# ISO8859_9, latin6 is ISO8859_10 and latin9 is ISO8859_15).
my %CharsetMaps = map {
    my ($charset, $module) = @$_;
    ($charset => "MHonArc/CharEnt/$module.pm");
} (
    [ 'iso-8859-1'   => 'ISO8859_1'  ],
    [ 'iso-8859-2'   => 'ISO8859_2'  ],
    [ 'iso-8859-3'   => 'ISO8859_3'  ],
    [ 'iso-8859-4'   => 'ISO8859_4'  ],
    [ 'iso-8859-5'   => 'ISO8859_5'  ],
    [ 'iso-8859-6'   => 'ISO8859_6'  ],
    [ 'iso-8859-7'   => 'ISO8859_7'  ],
    [ 'iso-8859-8'   => 'ISO8859_8'  ],
    [ 'iso-8859-9'   => 'ISO8859_9'  ],
    [ 'iso-8859-10'  => 'ISO8859_10' ],
    [ 'iso-8859-15'  => 'ISO8859_15' ],
    [ 'latin1'       => 'ISO8859_1'  ],
    [ 'latin2'       => 'ISO8859_2'  ],
    [ 'latin3'       => 'ISO8859_3'  ],
    [ 'latin4'       => 'ISO8859_4'  ],
    [ 'latin5'       => 'ISO8859_9'  ],
    [ 'latin6'       => 'ISO8859_10' ],
    [ 'latin9'       => 'ISO8859_15' ],
    [ 'windows-1250' => 'CP1250'     ],
    [ 'windows-1252' => 'CP1252'     ],
);
# Charset name => file that _reverse_load_charmap() require()s to obtain
# the entity => character table.  A charset with no row here gets its
# reverse table by inverting its forward (character => entity) table.
# Every file lives under MHonArc/CharEnt/, so only the module base name is
# listed per row.
my %ReverseCharsetMaps = map {
    my ($charset, $module) = @$_;
    ($charset => "MHonArc/CharEnt/$module.pm");
} (
    [ 'iso-8859-1'  => 'ISO8859_1R'  ],
    [ 'iso-8859-3'  => 'ISO8859_3R'  ],
    [ 'iso-8859-7'  => 'ISO8859_7R'  ],
    [ 'iso-8859-8'  => 'ISO8859_8R'  ],
    [ 'iso-8859-9'  => 'ISO8859_9R'  ],
    [ 'iso-8859-15' => 'ISO8859_15R' ],
    [ 'latin1'      => 'ISO8859_1R'  ],
    [ 'latin3'      => 'ISO8859_3R'  ],
    [ 'latin5'      => 'ISO8859_9R'  ],
    [ 'latin9'      => 'ISO8859_15R' ],
);
###############################################################################
## Routines
###############################################################################
##---------------------------------------------------------------------------##
##  str2sgml converts a string encoded by $charset to an SGML string
##  where special characters are replaced by entity references.
##
##      $return_data = MHonArc::CharEnt::str2sgml($data, $charset, $only8bit);
##
##  Arguments:
##      $data       String to convert; it is processed one character at
##                  a time.
##      $charset    Charset name.  Case is ignored and "_" is treated
##                  as "-".
##      $only8bit   If true, then only 8-bit characters (0x80-0xFF) are
##                  translated; 7-bit characters, including '"', '&',
##                  '<' and '>', are copied through untouched.
##
##  Returns the converted string ('' when $data is undefined or empty).
##  A character with no entity in the charset's table, nor in the common
##  %ASCIIMap table, is copied through unchanged.
##
sub str2sgml {
    my $data     = shift;
    my $charset  = lc shift;
    my $only8bit = shift;

    $charset =~ tr/_/-/;

    # Use the table cached for this charset, loading it on first use.
    my $map = $char2ent_maps{$charset};
    $map = _load_charmap($charset) unless defined $map;

    my $ret = '';
    return $ret unless defined $data;

    foreach my $ch (split(//, $data)) {
        my $code = unpack("C", $ch);
        my $entity;

        # In 8-bit-only mode the 7-bit range is left alone.  The cutoff is
        # 0x80 rather than 0xA0 because windows-1250 and windows-1252 use
        # 0x80-0x9F for printable characters; with a 0xA0 cutoff any
        # entities their tables define for that range were never emitted.
        # NOTE(review): charsets whose tables have no 0x80-0x9F entries
        # are unaffected, since unmapped codes still pass through below.
        unless ($only8bit && $code < 0x80) {
            # The charset table wins; %ASCIIMap is the common fallback.
            $entity = $map->{$code} || $ASCIIMap{$code};
        }

        $ret .= $entity ? '&' . $entity . ';' : pack("C", $code);
    }

    return $ret;
}
##---------------------------------------------------------------------------##
##  sgml2str converts a string containing character entity references
##  into the raw character values those entities denote in a given
##  character set.
##
##      $return_data = MHonArc::CharEnt::sgml2str($data, $charset);
##
##  $charset is matched case-insensitively with "_" treated as "-".
##  Entity names are looked up in the charset's table first and in the
##  common %ASCIIMapReverse table second; a reference found in neither
##  is left in the text exactly as written.
##
sub sgml2str {
    my ($data, $charset) = @_;

    $charset = lc $charset;
    $charset =~ tr/_/-/;

    # Fetch the entity => character table, building it on first use.
    my $map = $ent2char_maps{$charset};
    if (!defined $map) {
        $map = _reverse_load_charmap($charset);
    }

    # Convert character entities to raw values.
    $data =~ s{\&([\w\.\-]+);}{
        my $name = $1;
        my $code = defined($map->{$name}) ? $map->{$name}
                                          : $ASCIIMapReverse{$name};
        defined($code) ? sprintf("%c", $code) : "&$name;";
    }gex;

    return $data;
}
##---------------------------------------------------------------------------##
##  _load_charmap loads and caches the character => entity table for
##  $charset.
##
##      $map = _load_charmap($charset);
##
##  The table is whatever value the file named in %CharsetMaps returns
##  when require()d.  If the charset has no entry there, or the file fails
##  to load, a warning is issued and an empty table is used instead.
##  Either way the result is stored in %char2ent_maps under $charset and
##  returned.
##
sub _load_charmap {
    my ($charset) = @_;
    my $file = $CharsetMaps{$charset};
    my $map;

    if (defined $file) {
        # Drop any earlier %INC record so require() reads the file again
        # and yields the file's own value.
        delete $INC{$file};
        my $loaded = eval { require $file; };
        if ($@) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
        }
        else {
            $map = $loaded;
        }
    }
    else {
        warn 'Warning: MHonArc::CharEnt: Unknown charset: ', $charset, "\n";
    }

    # Failures are cached as an empty table as well.
    $map = { } unless defined $map;
    $char2ent_maps{$charset} = $map;
    return $map;
}
##---------------------------------------------------------------------------##
##  _reverse_load_charmap loads and caches the entity => character table
##  for $charset.
##
##      $map = _reverse_load_charmap($charset);
##
##  If %ReverseCharsetMaps names a file for the charset, the table is the
##  value that file returns when require()d; a load failure produces a
##  warning and an empty table.  Otherwise the table is built by inverting
##  the charset's character => entity table, loading that first if needed.
##  The result is stored in %ent2char_maps under $charset and returned.
##
sub _reverse_load_charmap {
    my ($charset) = @_;
    my $file = $ReverseCharsetMaps{$charset};
    my $map;

    if (defined $file) {
        # Drop any earlier %INC record so require() reads the file again
        # and yields the file's own value.
        delete $INC{$file};
        my $loaded = eval { require $file; };
        if ($@) {
            warn 'Warning: MHonArc::CharEnt: ', $@, "\n";
            $map = { };
        }
        else {
            $map = $loaded;
        }
    }
    else {
        # No dedicated reverse file: invert the forward table.
        my $forward = $char2ent_maps{$charset};
        $forward = _load_charmap($charset) unless defined $forward;
        $map = { reverse %$forward };
    }

    $ent2char_maps{$charset} = $map;
    return $map;
}
##---------------------------------------------------------------------------##
1;