#include "CLucene/StdHeader.h"
#include "Analyzers.h"
namespace
lucene{
namespace
analysis {
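
// CharTokenizer is the base for tokenizers that scan the input one
// character at a time: subclasses decide which characters belong to a
// token via isTokenChar(), and may rewrite kept characters via normalize().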
CharTokenizer::CharTokenizer(Reader* in):
    offset(0),
    bufferIndex(0),
    dataLen(0)
{
    input = in;
}
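
// The default normalization is the identity; subclasses such as
// LowerCaseTokenizer override this to fold each character.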
uchar_t CharTokenizer::normalize(const uchar_t c) {
    return c;
}
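
// Returns the next token in the stream, or NULL when the input is
// exhausted. Characters accepted by isTokenChar() are accumulated (after
// normalize()) until a rejected character or end of stream ends the token.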
Token* CharTokenizer::next() {
    int_t length = 0;
#ifdef UTF8
    int_t start = input->position();
    while (true) {
        uchar_t c;
        offset++;
        c = input->readChar();
        if (!c) {
            // End of stream: emit whatever has been collected, if anything.
            if (length > 0)
                break;
            else
                return NULL;
        }
        if (isTokenChar(c)) {
            buffer[length++] = normalize(c);
            if (length == MAX_WORD_LEN)
                throw "token too large for the buffer";
        } else if (length > 0) {
            // A non-token character terminates the current token.
            break;
        }
    }
    buffer[length] = 0;
    char* tokbuf = lc_ucs4_to_utf8(buffer, length);
    offset = input->position();
    Token* retval = new Token(tokbuf, start, offset);
    delete[] tokbuf;
    return retval;
#else
    int_t start = offset;
    while (true) {
        uchar_t c;
        offset++;
        // Refill the I/O buffer once it has been fully consumed.
        if (bufferIndex >= dataLen) {
            dataLen = input->read(ioBuffer, 0, IO_BUFFER_SIZE);
            bufferIndex = 0;
        }
        if (dataLen <= 0) {
            // End of stream: emit whatever has been collected, if anything.
            if (length > 0)
                break;
            else
                return NULL;
        } else
            c = ioBuffer[bufferIndex++];
        if (isTokenChar(c)) {
            if (length == 0)
                start = offset - 1;
            buffer[length++] = normalize(c);
            // Unlike the UTF8 path, an overlong token is truncated here
            // rather than raising an error.
            if (length == MAX_WORD_LEN)
                break;
        } else if (length > 0) {
            // A non-token character terminates the current token.
            break;
        }
    }
    buffer[length] = 0;
    return new Token(buffer, start, start + length);
#endif
}
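
// LetterTokenizer emits maximal runs of letters as tokens.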
bool LetterTokenizer::isTokenChar(const uchar_t c) {
    return Misc::isLetter(c) != 0;
}
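
// LowerCaseTokenizer additionally lower-cases every character it keeps.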
uchar_t LowerCaseTokenizer::normalize(const uchar_t chr) {
    return toLower(chr);
}
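
// WhitespaceTokenizer treats any non-whitespace character as part of a token.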
bool WhitespaceTokenizer::isTokenChar(const uchar_t c) {
    return isSpace(c) == 0;
}
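
// Note: the tokenStream() methods below return a heap-allocated stream
// bound to a reference; the caller is responsible for disposing of it.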
TokenStream& WhitespaceAnalyzer::tokenStream(const char_t* fieldName, Reader* reader) {
    return *new WhitespaceTokenizer(reader);
}
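
// SimpleAnalyzer tokenizes on non-letters and lower-cases the result.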
TokenStream& SimpleAnalyzer::tokenStream(const char_t* fieldName, Reader* reader) {
    return *new LowerCaseTokenizer(reader);
}
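
// LowerCaseFilter rewrites each token's text to lower case. Under UTF8 the
// text is case-folded into a fresh buffer that replaces the original.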
Token* LowerCaseFilter::next() {
    Token* t = input->next();
    if (t == NULL)
        return NULL;
#ifdef UTF8
    char* tmpstr = lc_utf8_strcasefold(t->termText);
    delete[] t->termText;
    t->termText = tmpstr;
#else
    stringLower(t->termText);
#endif
    return t;
}
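
// Builds the stop table from the given word list. The map is heap-allocated
// here and bound to the `table` reference member.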
StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream,
                       char_t* stopWords[], int_t stopWordsLength):
    TokenFilter(in, deleteTokenStream),
    table(*new VoidMap<char_t*, char_t*>)
{
    fillStopTable(table, stopWords, stopWordsLength);
}
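
// Inserts each stop word into the table, keyed by itself, so a membership
// test reduces to a single lookup.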
void StopFilter::fillStopTable(VoidMap<char_t*, char_t*>& stopTable,
                               char_t* stopWords[], int_t length) {
    for (int_t i = 0; i < length; i++)
        stopTable.put(stopWords[i], stopWords[i]);
}
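
// Returns the next token that is not a stop word, deleting any stopped
// tokens along the way; returns NULL once the wrapped stream is exhausted.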
Token* StopFilter::next() {
    for (Token* token = input->next(); token != NULL; token = input->next()) {
        if (!table.exists(token->termText)) {
            return token;
        } else {
            delete token;
        }
    }
    return NULL;
}
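
// The default constructor loads the built-in English stop list; the second
// form accepts a caller-supplied word list.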
StopAnalyzer::StopAnalyzer() {
    StopFilter::fillStopTable(stopTable, ENGLISH_STOP_WORDS, ENGLISH_STOP_WORDS_LENGTH);
}

StopAnalyzer::StopAnalyzer(char_t* stopWords[], int_t length) {
    StopFilter::fillStopTable(stopTable, stopWords, length);
}
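
// Chains a LowerCaseTokenizer into a StopFilter; `true` asks the filter to
// delete the tokenizer when the filter itself is destroyed. This call uses
// a StopFilter constructor that accepts the prebuilt stop table, declared
// in the header rather than defined in this file.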
TokenStream& StopAnalyzer::tokenStream(const char_t* fieldName, Reader* reader) {
    return *new StopFilter(new LowerCaseTokenizer(reader), true, stopTable);
}
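
// The built-in English stop word list.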
char_t* StopAnalyzer::ENGLISH_STOP_WORDS[] = {
    _T("a"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"),
    _T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"),
    _T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"),
    _T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"),
    _T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with")
};
}}