#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#define NEED_sv_2pv_flags
#define NEED_sv_2pv_nolen
#include "../ppport.h"
#define NUMLANG 15
#define NUMSTEM 29
#include "include/libstemmer.h"
/* All Lingua::Stem::Snowball objects and all calls to stem(),
* stem_in_place(), etc, reference the same set of Snowball struct
* sb_stemmers, all held in the singleton object
* $Lingua::Stem::Snowball::stemmifier, of class
* Lingua::Stem::Snowball::Stemmifier. Each sb_stemmer is created lazily, as
* soon as there is a need for its unique combination of language and
* encoding. They are destroyed during global cleanup, when
* $Lingua::Stem::Snowball::stemmifier is reclaimed.
*/
typedef struct Stemmifier {
struct sb_stemmer **stemmers;
} Stemmifier;
typedef struct LangEnc {
char *lang;
char *encoding; /* the real name of the encoding */
char *snowenc; /* the variant that libstemmer_c needs */
} LangEnc;
LangEnc lang_encs[] = {
{ "da", "ISO-8859-1", "ISO_8859_1" },
{ "de", "ISO-8859-1", "ISO_8859_1" },
{ "nl", "ISO-8859-1", "ISO_8859_1" },
{ "en", "ISO-8859-1", "ISO_8859_1" },
{ "es", "ISO-8859-1", "ISO_8859_1" },
{ "fi", "ISO-8859-1", "ISO_8859_1" },
{ "fr", "ISO-8859-1", "ISO_8859_1" },
{ "hu", "ISO-8859-1", "ISO_8859_1" },
{ "it", "ISO-8859-1", "ISO_8859_1" },
{ "no", "ISO-8859-1", "ISO_8859_1" },
{ "pt", "ISO-8859-1", "ISO_8859_1" },
{ "ro", "ISO-8859-2", "ISO_8859_2" },
{ "ru", "KOI8-R", "KOI8_R", },
{ "sv", "ISO-8859-1", "ISO_8859_1" },
{ "tr", "UTF-8", "UTF_8" },
{ "da", "UTF-8", "UTF_8" },
{ "de", "UTF-8", "UTF_8" },
{ "nl", "UTF-8", "UTF_8" },
{ "en", "UTF-8", "UTF_8" },
{ "es", "UTF-8", "UTF_8" },
{ "fi", "UTF-8", "UTF_8" },
{ "fr", "UTF-8", "UTF_8" },
{ "hu", "UTF-8", "UTF_8" },
{ "it", "UTF-8", "UTF_8" },
{ "no", "UTF-8", "UTF_8" },
{ "pt", "UTF-8", "UTF_8" },
{ "ro", "UTF-8", "UTF_8" },
{ "ru", "UTF-8", "UTF_8" },
{ "sv", "UTF-8", "UTF_8" },
};
MODULE = Lingua::Stem::Snowball PACKAGE = Lingua::Stem::Snowball
PROTOTYPES: disable
BOOT:
{
SV *sb_stemmer_list_sv = newSViv(PTR2IV(sb_stemmer_list));
SV *sb_stemmer_new_sv = newSViv(PTR2IV(sb_stemmer_new));
SV *sb_stemmer_delete_sv = newSViv(PTR2IV(sb_stemmer_delete));
SV *sb_stemmer_stem_sv = newSViv(PTR2IV(sb_stemmer_stem));
SV *sb_stemmer_length_sv = newSViv(PTR2IV(sb_stemmer_length));
hv_store(PL_modglobal, "Lingua::Stem::Snowball::sb_stemmer_list", 39,
sb_stemmer_list_sv, 0);
hv_store(PL_modglobal, "Lingua::Stem::Snowball::sb_stemmer_new", 38,
sb_stemmer_new_sv, 0);
hv_store(PL_modglobal, "Lingua::Stem::Snowball::sb_stemmer_delete", 41,
sb_stemmer_delete_sv, 0);
hv_store(PL_modglobal, "Lingua::Stem::Snowball::sb_stemmer_stem", 39,
sb_stemmer_stem_sv, 0);
hv_store(PL_modglobal, "Lingua::Stem::Snowball::sb_stemmer_length", 41,
sb_stemmer_length_sv, 0);
}
void
_derive_stemmer(self_hash)
HV *self_hash;
PPCODE:
{
SV **sv_ptr;
char *lang;
char *encoding;
int i;
int stemmer_id = -1;
/* Extract lang and encoding member variables. */
sv_ptr = hv_fetch(self_hash, "lang", 4, 0);
if (!sv_ptr)
croak("Couldn't find member variable 'lang'");
lang = SvPV_nolen(*sv_ptr);
sv_ptr = hv_fetch(self_hash, "encoding", 8, 0);
if (!sv_ptr)
croak("Couldn't find member variable 'encoding'");
encoding = SvPV_nolen(*sv_ptr);
/* See if the combo of lang and encoding is supported. */
for(i = 0; i < NUMSTEM; i++) {
if ( strcmp(lang, lang_encs[i].lang) == 0
&& strcmp(encoding, lang_encs[i].encoding) == 0
) {
Stemmifier *stemmifier;
SV *stemmifier_sv;
/* We have a match, so we know the stemmer id now. */
stemmer_id = i;
/* Retrieve communal Stemmifier. */
stemmifier_sv = get_sv("Lingua::Stem::Snowball::stemmifier", TRUE);
if ( sv_isobject(stemmifier_sv)
&& sv_derived_from(stemmifier_sv,
"Lingua::Stem::Snowball::Stemmifier")
) {
IV tmp = SvIV(SvRV(stemmifier_sv));
stemmifier = INT2PTR(Stemmifier*, tmp);
}
else {
croak("$L::S::S::stemmifier isn't a Stemmifier");
}
/* Construct a stemmer for lang/enc if there isn't one yet. */
if ( ! stemmifier->stemmers[stemmer_id] ) {
stemmifier->stemmers[stemmer_id]
= sb_stemmer_new(lang, lang_encs[stemmer_id].snowenc);
if ( ! stemmifier->stemmers[stemmer_id] ) {
croak("Failed to allocate an sb_stemmer for %s %s", lang,
encoding);
}
}
break;
}
}
/* Set the value of $self->{stemmer_id}. */
sv_ptr = hv_fetch(self_hash, "stemmer_id", 10, 0);
if (!sv_ptr)
croak("Couldn't access $self->{stemmer_id}");
sv_setiv(*sv_ptr, stemmer_id);
}
bool
_validate_language(language)
char *language;
CODE:
{
int i;
RETVAL = FALSE;
for (i = 0; i < NUMLANG; i++) {
if ( strcmp(language, lang_encs[i].lang) == 0 ) RETVAL = TRUE;
}
}
OUTPUT: RETVAL
void
stemmers(...)
PPCODE:
{
int i;
for (i = 0; i < NUMLANG; i++) {
XPUSHs( sv_2mortal(
newSVpvn( lang_encs[i].lang, strlen(lang_encs[i].lang) )
));
}
XSRETURN(NUMLANG);
}
void
stem_in_place(self_hash, words_av)
HV *self_hash;
AV *words_av;
PPCODE:
{
IV stemmer_id;
SV **sv_ptr;
Stemmifier *stemmifier;
SV *stemmifier_sv;
/* Retrieve the stemmifier. */
stemmifier_sv = get_sv("Lingua::Stem::Snowball::stemmifier", TRUE);
if ( sv_isobject(stemmifier_sv)
&& sv_derived_from(stemmifier_sv, "Lingua::Stem::Snowball::Stemmifier")
) {
IV tmp = SvIV(SvRV(stemmifier_sv));
stemmifier = INT2PTR(Stemmifier*, tmp);
}
else {
croak("$Lingua::Stem::Snowball::stemmifier isn't a Stemmifier");
}
/* Figure out which sb_stemmer to use. */
sv_ptr = hv_fetch(self_hash, "stemmer_id", 10, 0);
if (!sv_ptr)
croak("Couldn't access stemmer_id");
stemmer_id = SvIV(*sv_ptr);
if ( stemmer_id < 0
|| stemmer_id >= NUMSTEM
|| stemmifier->stemmers[stemmer_id] == NULL
) {
dSP;
ENTER;
SAVETMPS;
PUSHMARK(SP);
XPUSHs(ST(0));
PUTBACK;
call_method("_derive_stemmer", G_DISCARD);
FREETMPS;
LEAVE;
/* Extract what should now be a valid stemmer_id. */
sv_ptr = hv_fetch(self_hash, "stemmer_id", 10, 0);
stemmer_id = SvIV(*sv_ptr);
}
if (stemmer_id != -1) {
struct sb_stemmer *stemmer = stemmifier->stemmers[stemmer_id];
IV i, max;
for (i = 0, max = av_len(words_av); i <= max; i++) {
sv_ptr = av_fetch(words_av, i, 0);
if (SvOK(*sv_ptr)) {
STRLEN len;
sb_symbol *input_text = (sb_symbol*)SvPV(*sv_ptr, len);
const sb_symbol *stemmed_output
= sb_stemmer_stem(stemmer, input_text, (int)len);
len = sb_stemmer_length(stemmer);
sv_setpvn(*sv_ptr, (char*)stemmed_output, len);
}
}
}
}
MODULE = Lingua::Stem::Snowball PACKAGE = Lingua::Stem::Snowball::Stemmifier
SV*
new(class_name)
char* class_name;
CODE:
{
Stemmifier *stemmifier;
New(0, stemmifier, 1, Stemmifier);
Newz(0, stemmifier->stemmers, NUMSTEM, struct sb_stemmer*);
RETVAL = newSV(0);
sv_setref_pv(RETVAL, class_name, (void*)stemmifier);
}
OUTPUT: RETVAL
void
DESTROY(self_sv)
SV *self_sv;
PPCODE:
{
int i;
IV temp = SvIV( SvRV(self_sv) );
Stemmifier *stemmifier = INT2PTR(Stemmifier*, temp);
for (i = 0; i < NUMSTEM; i++) {
if (stemmifier->stemmers[i] != NULL)
sb_stemmer_delete(stemmifier->stemmers[i]);
}
Safefree(stemmifier);
}