# Perl interface to libtabe.  This preamble declares the export lists and
# the database constants, then loads the compiled half of the module with
# the bootstrap call at the end.
package Lingua::ZH::TaBE;
$VERSION = '0.07';
use 5.005;
use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
require Exporter;
require DynaLoader;
# Exporter supplies import(); DynaLoader supplies bootstrap().
@ISA = qw(Exporter DynaLoader);
# @EXPORT is never filled in, so nothing is exported by default; each name
# below has to be requested explicitly or through the ':all' tag.
@EXPORT_OK = qw(
TsiDBOpen
TsiInfoLookupPossibleTsiYin
TsiYinDBOpen
ChuInfoToChunkInfo
ChunkSegmentationSimplex
ChunkSegmentationComplex
ChunkSegmentationBackward
TsiInfoLookupZhiYin
YinLookupZhiList
YinToZuYinSymbolSequence
ZuYinSymbolSequenceToYin
ZuYinIndexToZuYinSymbol
ZuYinSymbolToZuYinIndex
ZozyKeyToZuYinIndex
ZhiIsBig5Code
ZhiToZhiCode
ZhiCodeToZhi
ZhiCodeToPackedBig5Code
ZhiCodeLookupRefCount
DB_TYPE_DB
DB_TYPE_LAST
DB_FLAG_OVERWRITE
DB_FLAG_CREATEDB
DB_FLAG_READONLY
DB_FLAG_NOSYNC
DB_FLAG_SHARED
DB_FLAG_NOUNPACK_YIN
);
# ':all' imports everything listed in @EXPORT_OK.
%EXPORT_TAGS = ( all => \@EXPORT_OK );
# Database type codes (the $type argument of the database open calls).
# NOTE(review): values presumably mirror libtabe's tabe.h -- confirm.
use constant DB_TYPE_DB => 0;
use constant DB_TYPE_LAST => 1;
# Database open flags.  Each value is a distinct bit, so several flags can
# be combined with bitwise OR.
use constant DB_FLAG_OVERWRITE => 0x01;
use constant DB_FLAG_CREATEDB => 0x02;
use constant DB_FLAG_READONLY => 0x04;
use constant DB_FLAG_NOSYNC => 0x08;
use constant DB_FLAG_SHARED => 0x10;
use constant DB_FLAG_NOUNPACK_YIN => 0x20;
# Load the compiled (XS) part of the module via DynaLoader, passing
# $VERSION along for the version check.
bootstrap Lingua::ZH::TaBE $VERSION;
# Opened database handles, shared between all objects created by new().
# Keys combine the kind of database ('tsi_db' / 'tsiyin_db') with the file
# path, so the two kinds never collide and the key does not depend on hash
# ordering.
my %cache;

# _default_db_path($basename)
#
# Returns the first existing "$dir/$basename" among the usual libtabe data
# directories.  When none of them exists, the first candidate is returned so
# that the caller still ends up with a defined path.
sub _default_db_path {
    my ($basename) = @_;

    my @candidates = map { "$_/$basename" } qw(
        /usr/local/lib/tabe
        /usr/local/share/tabe/tsiyin
        /usr/local/tabe/lib
    );

    for my $path (@candidates) {
        return $path if -e $path;
    }
    return $candidates[0];
}

# new( [tsi_db => $file, tsiyin_db => $file] )
#
# Creates a handle object.  A path given by the caller is always used as is;
# a missing one is searched for in the default locations.  Each database is
# opened read-only and shared, and only if its file exists -- otherwise the
# corresponding slot ('tsi_db' / 'tsiyin_db') is simply left unset.
#
# Returns the new object, blessed into $class.
sub new {
    my ($class, %args) = @_;

    $args{tsi_db}    ||= _default_db_path('tsi.db');
    $args{tsiyin_db} ||= _default_db_path('yin.db');

    my $self = {};

    # The class names are quoted on purpose: subs named TsiDB and TsiYinDB
    # exist in this package, and a bareword "Lingua::ZH::TaBE::TsiDB->new"
    # is parsed as a call to that sub once it has been compiled.
    if (-e $args{tsi_db}) {
        $self->{tsi_db} = (
            $cache{ join($;, 'tsi_db', $args{tsi_db}) }
                ||= 'Lingua::ZH::TaBE::TsiDB'->new(
                    Lingua::ZH::TaBE::DB_TYPE_DB(),
                    $args{tsi_db},
                    Lingua::ZH::TaBE::DB_FLAG_READONLY() |
                    Lingua::ZH::TaBE::DB_FLAG_SHARED(),
                )
        );
    }

    if (-e $args{tsiyin_db}) {
        $self->{tsiyin_db} = (
            $cache{ join($;, 'tsiyin_db', $args{tsiyin_db}) }
                ||= 'Lingua::ZH::TaBE::TsiYinDB'->new(
                    Lingua::ZH::TaBE::DB_TYPE_DB(),
                    $args{tsiyin_db},
                    Lingua::ZH::TaBE::DB_FLAG_READONLY() |
                    Lingua::ZH::TaBE::DB_FLAG_SHARED(),
                )
        );
    }

    return bless($self, $class);
}
# split( $string [, $method] )
#
# Builds a Chu from $string, walks its chunks, asks every chunk for its
# tsis (forwarding $method) and returns what each tsi's tsi() accessor
# yields.  In scalar context the caller gets the number of items.
sub split {
    my $self = shift;
    my ($string, $method) = @_;

    my @chunks = $self->Chu($string)->chunks;
    my @tsis   = map { $_->tsis($method) } @chunks;

    return map { $_->tsi } @tsis;
}
# Chu($string)
#
# Extracts every run of two-byte Big5 characters from the last argument,
# joins the runs with a Big5 full-width comma and wraps the result in a
# Lingua::ZH::TaBE::Chu object.
sub Chu {
    my $class = shift;

    # Bytes 0xA1 0x41 are the Big5 full-width comma.  They are written as
    # escapes so the separator stays correct even if this source file is
    # ever converted to another encoding.
    my $separator = "\xa1\x41";

    my @runs = $_[-1] =~ m/((?:[\xa1-\xf9][\x40-\x7e\xa1-\xfe])+)/g;

    # Quoted class name: a sub called Chu lives in this package, and a
    # bareword would be parsed as a call to it wherever that sub is visible.
    return 'Lingua::ZH::TaBE::Chu'->new(join($separator, @runs));
}
# Chunk($string)
#
# Keeps only the runs of two-byte Big5 characters found in the last
# argument, concatenates them and wraps the result in a
# Lingua::ZH::TaBE::Chunk object.
sub Chunk {
    my $class = shift;

    my @runs = $_[-1] =~ m/((?:[\xa1-\xf9][\x40-\x7e\xa1-\xfe])+)/g;
    my $text = join('', @runs);

    return Lingua::ZH::TaBE::Chunk->new($text);
}
# Tsi($string)
#
# Wraps the first run of two-byte Big5 characters in $string in a
# Lingua::ZH::TaBE::Tsi object; an empty string is used when there is none.
sub Tsi {
    my $self = shift;

    my $text = '';
    if ($_[0] =~ m/((?:[\xa1-\xf9][\x40-\x7e\xa1-\xfe])+)/) {
        $text = $1;
    }

    return Lingua::ZH::TaBE::Tsi->new($text);
}
# Zhi($code_or_text)
#
# Builds a Lingua::ZH::TaBE::Zhi object.  An all-digit argument is treated
# as a zhi code and converted with ZhiCodeToZhi(); otherwise the first
# two-byte Big5 character found in the argument is used.  When the argument
# contains no such character the object wraps an empty string, which is the
# same fallback Tsi() uses, instead of being built without any value.
sub Zhi {
    my $class = shift;

    my $zhi;
    if ($_[0] =~ /^\d+$/) {
        $zhi = Lingua::ZH::TaBE::ZhiCodeToZhi($_[0]);
    }
    elsif ($_[0] =~ m/((?:[\xa1-\xf9][\x40-\x7e\xa1-\xfe]))/) {
        $zhi = $1;
    }
    else {
        $zhi = '';
    }

    # Quoted class name: a sub called Zhi lives in this package, and a
    # bareword would be parsed as a call to it wherever that sub is visible.
    return 'Lingua::ZH::TaBE::Zhi'->new($zhi);
}
# Yin($code_or_symbols)
#
# Builds a Lingua::ZH::TaBE::Yin object.  An all-digit argument is passed
# through untouched; anything else is first converted with
# ZuYinSymbolSequenceToYin().
sub Yin {
    my $class = shift;

    if ($_[0] =~ /^\d+$/) {
        return Lingua::ZH::TaBE::Yin->new($_[0]);
    }

    return Lingua::ZH::TaBE::Yin->new(
        Lingua::ZH::TaBE::ZuYinSymbolSequenceToYin($_[0])
    );
}
# ZuYin($index_or_symbol)
#
# Builds a Lingua::ZH::TaBE::ZuYin object.  An all-digit argument is passed
# through untouched; anything else is first converted with
# ZuYinSymbolToZuYinIndex().
sub ZuYin {
    my $class = shift;

    if ($_[0] =~ /^\d+$/) {
        return Lingua::ZH::TaBE::ZuYin->new($_[0]);
    }

    return Lingua::ZH::TaBE::ZuYin->new(
        Lingua::ZH::TaBE::ZuYinSymbolToZuYinIndex($_[0])
    );
}
# ZozyKey($key)
#
# Converts $key with ZozyKeyToZuYinIndex() and wraps the result in a
# Lingua::ZH::TaBE::ZuYin object.
sub ZozyKey {
    my ($class, $key) = @_;

    # The class name must be quoted here.  A sub named ZuYin is already
    # defined in this package when this code is compiled, so the bareword
    # form "Lingua::ZH::TaBE::ZuYin->new" would be parsed as
    # "Lingua::ZH::TaBE::ZuYin()->new" and call that sub instead of naming
    # the class.
    return 'Lingua::ZH::TaBE::ZuYin'->new(
        Lingua::ZH::TaBE::ZozyKeyToZuYinIndex($key)
    );
}
# Returns whatever is stored in the object's 'tsi_db' slot.
sub TsiDB {
    my ($self) = @_;
    return $self->{tsi_db};
}
# Returns whatever is stored in the object's 'tsiyin_db' slot.
sub TsiYinDB {
    my ($self) = @_;
    return $self->{tsiyin_db};
}
package Lingua::ZH::TaBE::Chu;

# A Chu object stringifies to the value of its chu() accessor.
# chu(), num_chunk(), ToChunkInfo() and chunk() are not defined in this
# file -- presumably they come from the XS layer.
use overload '""' => sub { $_[0]->chu }, fallback => 1;

# chunks()
#
# Calls ToChunkInfo() first when num_chunk() reports no chunks yet.
# Returns the chunk() list in list context, a reference to a fresh array
# holding it in scalar context, and nothing in void context.
sub chunks {
    my ($self) = @_;

    if ($self->num_chunk <= 0) {
        $self->ToChunkInfo;
    }

    return unless defined wantarray;
    return $self->chunk if wantarray;
    return [ $self->chunk ];
}
package Lingua::ZH::TaBE::Chunk;

# A Chunk object stringifies to the value of its chunk() accessor.
# chunk(), num_tsi(), tsi() and the Segmentation* methods are not defined in
# this file -- presumably they come from the XS layer.
use overload '""' => sub { $_[0]->chunk }, fallback => 1;

# Segmentation method names, selected by the lower-cased first letter of the
# name the caller passes in.
my %methods = (
    s => 'SegmentationSimplex',
    c => 'SegmentationComplex',
    b => 'SegmentationBackward',
);

# tsis( [$method, ...] )
#
# Runs Segmentation() with the given arguments when num_tsi() reports that
# nothing has been segmented yet.  Returns the tsi() list in list context, a
# reference to a fresh array holding it in scalar context, and nothing in
# void context.
sub tsis {
    my $self = shift;

    if ($self->num_tsi <= 0) {
        $self->Segmentation(@_);
    }

    return unless defined wantarray;
    return $self->tsi if wantarray;
    return [ $self->tsi ];
}

# Segmentation( [$method, ...] )
#
# Does nothing unless the chunk text starts with a byte in \xa4-\xf9
# (NOTE(review): presumably this skips Big5 symbol/punctuation rows --
# confirm).  Otherwise picks the segmentation routine from the first letter
# of $method (default 's') and calls it with the remaining arguments.
# Dies on an unknown method name.
sub Segmentation {
    my $self = shift;

    if ($self->chunk =~ /^[\xa4-\xf9]/) {
        my $method  = shift || 's';
        my $initial = lc(substr($method, 0, 1));
        my $func    = $methods{$initial};
        die "Unknown segmentation method: $method" unless $func;
        return $self->$func(@_);
    }
}
package Lingua::ZH::TaBE::Tsi;

# A Tsi object stringifies to the value of its tsi() accessor.
# tsi(), yinnum(), yindata() and LookupZhiYin() are not defined in this
# file -- presumably they come from the XS layer.
use overload '""' => sub { $_[0]->tsi }, fallback => 1;

# zhis()
#
# Splits the tsi text into two-byte Big5 characters and turns each one into
# an object via Lingua::ZH::TaBE->Zhi.  Returns the list in list context, an
# array reference in scalar context, and nothing in void context.
sub zhis {
    my ($self) = @_;
    return unless defined wantarray;

    my @zhis = map { Lingua::ZH::TaBE->Zhi($_) }
        $self->tsi =~ m/([\xa1-\xf9][\x40-\x7e\xa1-\xfe])/g;

    return wantarray ? @zhis : \@zhis;
}

# yins( [...] )
#
# Calls LookupZhiYin() with the given arguments when yinnum() is false.
# Returns the yindata() list in list context, a reference to a fresh array
# holding it in scalar context, and nothing in void context.
sub yins {
    my $self = shift;

    if (!$self->yinnum) {
        $self->LookupZhiYin(@_);
    }

    return unless defined wantarray;
    return $self->yindata if wantarray;
    return [ $self->yindata ];
}
package Lingua::ZH::TaBE::Zhi;

# A Zhi object is a blessed reference to a scalar holding the character.
# It converts to its zhi code in numeric context and to the character
# itself in string context.
use overload '0+' => sub { shift->ToZhiCode },
             '""' => sub { shift->zhi },
             fallback => 1;

# yins( [...] )
#
# Wraps the character in a Tsi object, runs LookupZhiYin() on it with the
# given arguments and returns its yindata(): the list in list context, an
# array reference in scalar context, nothing in void context.
sub yins {
    my $self = shift;

    my $tsi = Lingua::ZH::TaBE->Tsi($self->zhi);
    $tsi->LookupZhiYin(@_);

    return unless defined wantarray;
    return wantarray ? $tsi->yindata : [ $tsi->yindata ];
}

# Returns the wrapped character.
sub zhi {
    my ($self) = @_;
    return $$self;
}

# new($zhi)
#
# Wraps a private copy of $zhi.  Taking a reference to the argument itself
# would bless the caller's own variable (so later changes to it would leak
# into the object) and would fail for read-only values such as literals.
sub new {
    my ($class, $zhi) = @_;
    return bless(\$zhi, $class);
}

# Thin wrappers around the procedural interface of Lingua::ZH::TaBE.
sub IsBig5Code {
    my ($self) = @_;
    return Lingua::ZH::TaBE::ZhiIsBig5Code($self->zhi);
}

sub ToZhiCode {
    my ($self) = @_;
    return Lingua::ZH::TaBE::ZhiToZhiCode($self->zhi);
}

# Same as zhi().
sub ToZhi {
    my ($self) = @_;
    return $self->zhi;
}

sub ToPackedBig5Code {
    my ($self) = @_;
    return Lingua::ZH::TaBE::ZhiCodeToPackedBig5Code($self->ToZhiCode);
}

sub LookupRefCount {
    my ($self) = @_;
    return Lingua::ZH::TaBE::ZhiCodeLookupRefCount($self->ToZhiCode);
}
package Lingua::ZH::TaBE::Yin;

# A Yin object is a blessed reference to a scalar holding the yin value.
# It converts to that value in numeric context and to its zuyin symbol
# sequence in string context.
use overload '0+' => sub { shift->yin },
             '""' => sub { shift->ToZuYinSymbolSequence },
             fallback => 1;

# Returns the wrapped yin value.
sub yin {
    my ($self) = @_;
    return $$self;
}

# Converts the yin value with YinToZuYinSymbolSequence().
sub ToZuYinSymbolSequence {
    my ($self) = @_;
    return Lingua::ZH::TaBE::YinToZuYinSymbolSequence($self->yin);
}

# new($yin)
#
# Wraps a private copy of $yin.  Taking a reference to the argument itself
# would bless the caller's own variable (so later changes to it would leak
# into the object) and would fail for read-only values such as literals.
sub new {
    my ($class, $yin) = @_;
    return bless(\$yin, $class);
}

# zuyins()
#
# Splits the symbol sequence into two-byte Big5 characters and turns each
# one into an object via Lingua::ZH::TaBE->ZuYin.  Returns the list in list
# context, an array reference in scalar context, nothing in void context.
sub zuyins {
    my ($self) = @_;
    return unless defined wantarray;

    my @zuyins = map { Lingua::ZH::TaBE->ZuYin($_) }
        $self->ToZuYinSymbolSequence =~ m/([\xa1-\xf9][\x40-\x7e\xa1-\xfe])/g;

    return wantarray ? @zuyins : \@zuyins;
}

# zhis()
#
# Splits the result of LookupZhiList() into two-byte Big5 characters and
# turns each one into an object via Lingua::ZH::TaBE->Zhi.  Same context
# rules as zuyins().
sub zhis {
    my ($self) = @_;
    return unless defined wantarray;

    my @zhis = map { Lingua::ZH::TaBE->Zhi($_) }
        $self->LookupZhiList =~ m/([\xa1-\xf9][\x40-\x7e\xa1-\xfe])/g;

    return wantarray ? @zhis : \@zhis;
}

# Looks the yin value up with YinLookupZhiList().
sub LookupZhiList {
    my ($self) = @_;
    return Lingua::ZH::TaBE::YinLookupZhiList($self->yin);
}

# Same as yin().
sub ToYin {
    my ($self) = @_;
    return $self->yin;
}
package Lingua::ZH::TaBE::ZuYin;

# A ZuYin object is a blessed reference to a scalar holding a zuyin index.
# It converts to that index in numeric context and to its symbol (the
# result of zhi()) in string context.
use overload '0+' => sub { shift->zuyin },
             '""' => sub { shift->zhi },
             fallback => 1;

# Returns the wrapped zuyin index.
sub zuyin {
    my ($self) = @_;
    return $$self;
}

# Builds a Yin object from the text of this symbol.
sub yin {
    my ($self) = @_;
    return Lingua::ZH::TaBE->Yin($self->ToZuYinSymbol->zhi);
}

# new($zuyin)
#
# Wraps a private copy of $zuyin.  Taking a reference to the argument itself
# would bless the caller's own variable (so later changes to it would leak
# into the object) and would fail for read-only values such as literals.
sub new {
    my ($class, $zuyin) = @_;
    return bless(\$zuyin, $class);
}

# Converts the index with ZuYinIndexToZuYinSymbol() and returns the result
# of Lingua::ZH::TaBE->Zhi for that symbol.
sub zhi {
    my ($self) = @_;
    return Lingua::ZH::TaBE->Zhi(
        Lingua::ZH::TaBE::ZuYinIndexToZuYinSymbol($self->zuyin)
    );
}

# Same as zhi().
sub ToZuYinSymbol {
    my ($self) = @_;
    return $self->zhi;
}

# Same as zuyin().
sub ToZuYinIndex {
    my ($self) = @_;
    return $self->zuyin;
}
1;
__END__
=encoding big5
=head1 NAME
Lingua::ZH::TaBE - Chinese processing via libtabe
=head1 VERSION
This document describes version 0.07 of Lingua::ZH::TaBE, released
December 31, 2005.
=head1 SYNOPSIS
use Lingua::ZH::TaBE;
my $tabe = Lingua::ZH::TaBE->new;
# Phrase splitter
my @phrases = $tabe->split(
"·í§Ú̦b¹q¸£¤¤³B²z¤¤¤å¸ê°T®É¡A¬Û«H¨ä¤¤³Ì´o¤Hªº".
"ª¬ªp¤§¤@¡A²ö¹L©ó·Q¥´ªº¦r¥´¤£¥X¨Ó¤F¡C"
);
# Chaining various components
print $tabe->Chu("¹D¥i¹D¡A«D±`¹D¡C") # sentence
->chunks->[2] # «D±`¹D # chunk
->tsis->[0] # «D±` # phrase
->zhis->[1] # ±` # character
->yins->[0] # £¥£µ£½ # pronunciation
->zuyins->[0]; # £¥ # phonetic symbols
=head1 DESCRIPTION
This module is a Perl interface to the B<TaBE> (Taiwan and Big5
Encoding) library, a unified interface and library for dealing with
Chinese words, phrases, sentences, and phonetic symbols; it is intended
to be used as the foundation of Chinese text processing.
B<Lingua::ZH::TaBE> provides an object-oriented interface (preferred),
as well as a procedural interface consisting of all C functions in
C<tabe.h>.
=head1 Object-Oriented Interface
=head2 Lingua::ZH::TaBE
=over 4
=item new( [tsi_db => $file, tsiyin_db => $file] )
Creates a LibTaBE handle and opens the databases. If a database file is
not specified, it is looked up automatically in the usual libtabe data
directories.
=item split( $string [, $method] )
Splits the text in C<$string> and returns a list of strings representing
the words obtained. You may specify C<Complex> or C<Backward> as
C<$method> to use an alternate segmentation algorithm.
=item Chu(), Chunk(), Tsi(), Zhi(), Yin(), ZuYin()
Constructors for the various levels of objects, each taking one argument
for initialization.
=back
=head2 Lingua::ZH::TaBE::Chu
=over 4
=item chunks()
=back
=head2 Lingua::ZH::TaBE::Chunk
=over 4
=item tsis([$method])
=back
=head2 Lingua::ZH::TaBE::Tsi
=over 4
=item zhis()
=item yins()
=back
=head2 Lingua::ZH::TaBE::Zhi
=over 4
=item yins()
=item ToZhi()
=item ToZhiCode()
=item IsBig5Code()
=item ToPackedBig5Code()
=item LookupRefCount()
=back
=head2 Lingua::ZH::TaBE::Yin
=over 4
=item zuyins()
=item zhis()
=item ToYin()
=item ToZuYinSymbolSequence()
=back
=head2 Lingua::ZH::TaBE::ZuYin
=over 4
=item yin()
=item zhi()
=back
=head1 Procedural Interface
All functions below belong to the B<Lingua::ZH::TaBE> class; they are
not exported by default, but may be imported explicitly, or implicitly
via C<use Lingua::ZH::TaBE ':all'>.
$TsiDB = TsiDBOpen($type, $db_name, $flags);
$num = TsiInfoLookupPossibleTsiYin($TsiDB, $Tsi);
$TsiYinDB = TsiYinDBOpen($type, $db_name, $flags);
$num = ChuInfoToChunkInfo($Chu);
$num = ChunkSegmentationSimplex($TsiDB, $Chunk);
$num = ChunkSegmentationComplex($TsiDB, $Chunk);
$num = ChunkSegmentationBackward($TsiDB, $Chunk);
$num = TsiInfoLookupZhiYin($TsiDB, $Tsi);
$string = YinLookupZhiList($Yin);
$string = YinToZuYinSymbolSequence($Yin);
$yin = ZuYinSymbolSequenceToYin($string);
$zhi = ZuYinIndexToZuYinSymbol($ZuYin);
$zuyin = ZuYinSymbolToZuYinIndex($Zhi);
$zuyin = ZozyKeyToZuYinIndex($key);
$num = ZhiIsBig5Code($Zhi);
$zhicode = ZhiToZhiCode($Zhi);
$zhi = ZhiCodeToZhi($zhicode);
$num = ZhiCodeToPackedBig5Code($zhicode);
$num = ZhiCodeLookupRefCount($zhicode);
=head1 Constants
All constants below belong to the B<Lingua::ZH::TaBE> class; they are
not exported by default, but may be imported explicitly, or implicitly
via C<use Lingua::ZH::TaBE ':all'>.
DB_TYPE_DB 0
DB_TYPE_LAST 1
DB_FLAG_OVERWRITE 0x01
DB_FLAG_CREATEDB 0x02
DB_FLAG_READONLY 0x04
DB_FLAG_NOSYNC 0x08
DB_FLAG_SHARED 0x10
DB_FLAG_NOUNPACK_YIN 0x20
=head1 CAVEATS
The B<TsiYin> family of functions is not yet complete.
=head1 SEE ALSO
L<ftp://xcin.linux.org.tw/pub/xcin/libtabe/devel/>
L<http://libtabe.sourceforge.net/>
=head1 AUTHORS
Audrey Tang E<lt>autrijus@autrijus.orgE<gt>
=head1 COPYRIGHT
Copyright 2003, 2004, 2005 by Audrey Tang E<lt>autrijus@autrijus.orgE<gt>.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
See L<http://www.perl.com/perl/misc/Artistic.html>
=cut