#!/usr/bin/perl
setlocale(
"LC_CTYPE"
,
"en_US"
);
$charset
=
"A-Za-z\xa0-\xbf\xc0-\xd6\xda-\xdd\xdf-\xf6\xf9-\xfd\xff'\\-"
;
$syllables
=
"../Werder/data/syllables"
;
sub
parse($) {
my
$werd
=
shift
;
my
$start_at
=
shift
;
my
(
$syl
,
$ready
,
$next_syl
);
if
(
$werd
eq
""
) {
return
(1); }
$werd
=~ /^((.).?)/;
my
$first
=
lc
($2);
my
$firsttwo
=
lc
($1);
foreach
$syl
(@{
$indexed_syllables
{
$first
}{
$first
}}, @{
$indexed_syllables
{
$first
}{
$firsttwo
}}) {
$next_syl
= 0;
if
(
$start_at
&& !
$ready
) {
$next_syl
= 1; }
if
(
$syl
eq
$start_at
) {
$ready
= 1; }
next
if
$next_syl
;
if
(
$werd
=~ /^
$syl
(.*)$/si) {
push
@werd_parts
,
$syl
;
my
$ret
= parse($1);
return
(
$ret
)
if
$ret
;
}
}
if
(
scalar
(
@werd_parts
)) {
my
$oldsyl
=
pop
@werd_parts
;
print
"* going back from $oldsyl\n"
if
$debug
;
my
$ret
= parse(
$oldsyl
.
$werd
,
$oldsyl
);
return
(
$ret
)
if
$ret
;
}
else
{
return
(0);
}
}
open
SYLS,
$syllables
;
chomp
(
@syllables
= <SYLS>);
close
SYLS;
@syllables
=
sort
{
$min
= (
length
(
$a
) <
length
(
$b
)) ?
length
(
$a
) :
length
(
$b
);
(
substr
(
$a
,0,
$min
) cmp
substr
(
$b
,0,
$min
) ||
length
(
$b
) <=>
length
(
$a
) )
}
@syllables
;
foreach
$syl
(
@syllables
) {
$syl
=~ /^((.).?)/;
my
$first
=
lc
($2);
my
$firsttwo
=
lc
($1);
push
@{
$indexed_syllables
{
$first
}{
$firsttwo
}},
$syl
;
}
while
(<>) {
$line
=
$_
;
while
(
$line
=~ /[^
$charset
]*([
$charset
]+)[^
$charset
]*/sig) {
$werd
= $1;
$werd
=~ s/^['\-]*//;
$werd
=~ s/['\-]*$//;
if
(
$werd
eq
""
) {
next
; }
if
(!parse(
$werd
)) {
print
"* couldnt parse $werd\n"
;
}
else
{
my
$werd_sep
=
join
' '
,
@werd_parts
;
}
undef
@werd_parts
;
}
}