Data Structures */
/*
 * Main libswish3 object: groups one configuration, one analyzer and one
 * parser behind a single handle.
 */
struct swish_3
{
    int             ref_cnt;    /* presumably a reference count -- confirm */
    void           *stash;      /* opaque slot; sibling structs mark theirs "for bindings" */
    swish_Config   *config;
    swish_Analyzer *analyzer;
    swish_Parser   *parser;
};
/*
 * An array of xmlChar strings plus two unsigned counters.
 */
struct swish_StringList
{
    unsigned int n;     /* presumably the number of strings stored -- confirm */
    unsigned int max;   /* presumably the allocated capacity -- confirm */
    xmlChar    **word;  /* the strings */
};
/*
 * Configuration object: a set of libxml2 hash tables, one per configuration
 * area, plus a flags struct.
 */
struct swish_Config
{
    int             ref_cnt;
    void           *stash;          /* for bindings */
    xmlHashTablePtr misc;
    xmlHashTablePtr properties;
    xmlHashTablePtr metanames;
    xmlHashTablePtr tag_aliases;
    xmlHashTablePtr parsers;
    xmlHashTablePtr mimes;
    xmlHashTablePtr index;
    xmlHashTablePtr stringlists;
    struct swish_ConfigFlags *flags; /* shortcuts for parsing */
};
/*
 * Parsing flags and id bookkeeping referenced from swish_Config.flags.
 *
 * Comments are written in block form so the declaration stays well-formed
 * however the line breaks fall.
 */
struct swish_ConfigFlags
{
    boolean         tokenize;
    boolean         cascade_meta_context;
    boolean         ignore_xmlns;
    boolean         follow_xinclude;
    int             undef_metas;
    int             undef_attrs;
    int             max_meta_id;
    int             max_prop_id;
    xmlHashTablePtr meta_ids;
    xmlHashTablePtr prop_ids;
    /* xmlHashTablePtr contexts;   -- member disabled in the original */
};
/*
 * A hash table wrapped with the two binding-support fields used by the
 * other objects in this file.
 */
struct swish_NamedBuffer
{
    int             ref_cnt;    /* for bindings */
    void           *stash;      /* for bindings */
    xmlHashTablePtr hash;       /* the meat */
};
/*
 * Per-document metadata record (swish_ParserData refers to it as the
 * "document-specific properties").
 */
struct swish_DocInfo
{
    time_t       mtime;
    off_t        size;
    xmlChar     *mime;
    xmlChar     *encoding;
    xmlChar     *uri;
    unsigned int nwords;
    xmlChar     *ext;
    xmlChar     *parser;
    xmlChar     *action;
    boolean      is_gzipped;
    int          ref_cnt;
};
/*
 * A MetaName definition: numeric id, name, bias and an optional alias
 * target.
 */
struct swish_MetaName
{
    int      ref_cnt;
    int      id;
    xmlChar *name;
    int      bias;
    xmlChar *alias_for;     /* presumably the name this one aliases -- confirm */
};
/*
 * A Property definition: numeric id, name, type and the flags and limits
 * that accompany it.
 */
struct swish_Property
{
    int          ref_cnt;
    int          id;
    xmlChar     *name;
    boolean      ignore_case;
    int          type;
    boolean      verbatim;
    xmlChar     *alias_for;     /* presumably the name this one aliases -- confirm */
    unsigned int max;
    boolean      sort;
    boolean      presort;
    unsigned int sort_length;
};
struct swish_Token { unsigned int pos; // this token's position in document swish_MetaName *meta; xmlChar *value; xmlChar *context; unsigned int offset; unsigned int len; int ref_cnt; };
struct swish_TokenList { unsigned int n; unsigned int pos; // track position in document xmlHashTablePtr contexts; // cache contexts xmlBufferPtr buf; swish_Token** tokens; int ref_cnt; };
/*
 * Iteration state over a swish_TokenList, paired with the analyzer in use.
 *
 * Comments are written in block form so the declaration stays well-formed
 * however the line breaks fall.
 */
struct swish_TokenIterator
{
    swish_TokenList *tl;
    swish_Analyzer  *a;
    unsigned int     pos;       /* position in iteration */
    int              ref_cnt;
};
/*
 * One entry of a tag stack; entries are chained through `next`.
 *
 * Comments are written in block form so the declaration stays well-formed
 * however the line breaks fall.
 */
struct swish_Tag
{
    xmlChar          *raw;      /* tag as libxml2 sees it */
    xmlChar          *baked;    /* tag as libswish3 sees it */
    xmlChar          *context;
    struct swish_Tag *next;
    unsigned int      n;
};
/*
 * A stack of swish_Tag entries.
 *
 * Comments are written in block form so the declaration stays well-formed
 * however the line breaks fall.
 */
struct swish_TagStack
{
    swish_Tag   *head;
    swish_Tag   *temp;
    unsigned int count;
    char        *name;          /* debugging aid -- name of the stack */
};
struct swish_Analyzer { unsigned int maxwordlen; // max word length unsigned int minwordlen; // min word length boolean tokenize; // should we parse into TokenList int (*tokenizer) (swish_TokenIterator*, xmlChar*, swish_MetaName*, xmlChar*); xmlChar* (*stemmer) (xmlChar*); boolean lc; // should tokens be lowercased void *stash; // for script bindings void *regex; // optional regex int ref_cnt; // for script bindings };
struct swish_Parser { int ref_cnt; // for script bindings void (*handler)(swish_ParserData*); // handler reference void *stash; // for script bindings int verbosity; };
struct swish_ParserData { swish_3 *s3; // main object xmlBufferPtr meta_buf; // tmp MetaName buffer xmlBufferPtr prop_buf; // tmp Property buffer xmlChar *tag; // current tag name swish_DocInfo *docinfo; // document-specific properties unsigned int ignore_content; // toggle flag. should buffer be indexed. boolean is_html; // shortcut flag for html parser boolean bump_word; // boolean for moving word position/adding space unsigned int offset; // current offset position swish_TagStack *metastack; // stacks for tracking the tag => metaname swish_TagStack *propstack; // stacks for tracking the tag => property swish_TagStack *domstack; // stacks for tracking xml/html dom tree xmlParserCtxtPtr ctxt; // so we can free at end swish_TokenIterator *token_iterator; // token container swish_NamedBuffer *properties; // buffer all properties swish_NamedBuffer *metanames; // buffer all metanames };
/*
Global Functions */
/*
 * Library-wide entry points: a setup call, two version-string accessors and
 * an environment setter. Declarations only; the definitions are not in this
 * file.
 *
 * Functions that take no arguments are declared with (void) so they are
 * real prototypes and the compiler can reject calls that pass arguments.
 */
void        swish_setup(void);
const char *swish_lib_version(void);
const char *swish_libxml2_version(void);
void        swish_setenv(char *name, char *value, int override);
/*
Top-Level Functions */
/*
 * Lifecycle and parse entry points for the swish_3 object. Declarations
 * only; the definitions are not in this file.
 *
 * Note the return types differ: the file/buffer parsers return int, while
 * the filehandle and directory parsers return unsigned int.
 */
swish_3     *swish_3_init(void (*handler) (swish_ParserData *), void *stash);
void         swish_3_free(swish_3 *s3);
int          swish_parse_file(swish_3 *s3, xmlChar *filename);
unsigned int swish_parse_fh(swish_3 *s3, FILE *fh);
int          swish_parse_buffer(swish_3 *s3, xmlChar *buf);
unsigned int swish_parse_directory(swish_3 *s3, xmlChar *dir,
                                   boolean follow_symlinks);
/*
I/O Functions */ xmlChar * swish_io_slurp_fh( FILE * fh, unsigned long flen, boolean binmode ); xmlChar * swish_io_slurp_file_len( xmlChar *filename, off_t flen, boolean binmode ); xmlChar * swish_io_slurp_gzfile_len( xmlChar *filename, off_t *flen, boolean binmode ); xmlChar * swish_io_slurp_file( xmlChar *filename, off_t flen, boolean is_gzipped, boolean binmode ); long int swish_io_count_operable_file_lines( xmlChar *filename ); boolean swish_io_is_skippable_line( xmlChar *str ); /*
Filesystem Functions */ boolean swish_fs_file_exists( xmlChar *filename ); boolean swish_fs_is_dir( xmlChar *path ); boolean swish_fs_is_file( xmlChar *path ); boolean swish_fs_is_link( xmlChar *path ); off_t swish_fs_get_file_size( xmlChar *path ); time_t swish_fs_get_file_mtime( xmlChar *path ); xmlChar * swish_fs_get_file_ext( xmlChar *url ); xmlChar * swish_fs_get_path( xmlChar *url ); boolean swish_fs_looks_like_gz( xmlChar *file ); /*
Hash Functions */ int swish_hash_add( xmlHashTablePtr hash, xmlChar *key, void * value ); int swish_hash_replace( xmlHashTablePtr hash, xmlChar *key, void *value ); int swish_hash_delete( xmlHashTablePtr hash, xmlChar *key ); boolean swish_hash_exists( xmlHashTablePtr hash, xmlChar *key ); int swish_hash_exists_or_add( xmlHashTablePtr hash, xmlChar *key, xmlChar *value ); void swish_hash_merge( xmlHashTablePtr hash1, xmlHashTablePtr hash2 ); void * swish_hash_fetch( xmlHashTablePtr hash, xmlChar *key ); void swish_hash_dump( xmlHashTablePtr hash, const char *label ); xmlHashTablePtr swish_hash_init(int size); void swish_hash_free( xmlHashTablePtr hash ); /*
Memory Functions */
/*
 * Allocation wrappers, string duplicators and allocation-count helpers.
 * Declarations only; the definitions are not in this file.
 *
 * Functions that take no arguments are declared with (void) so they are
 * real prototypes and the compiler can reject calls that pass arguments.
 */
void     swish_mem_init(void);
void    *swish_xrealloc(void *ptr, size_t size);
void    *swish_xmalloc(size_t size);
void     swish_xfree(void *ptr);
void     swish_mem_debug(void);
long int swish_memcount_get(void);
void     swish_memcount_dec(void);
xmlChar *swish_xstrdup(const xmlChar *ptr);
xmlChar *swish_xstrndup(const xmlChar *ptr, int len);
/*
Time Functions */
/*
 * Timing queries and time-to-string formatters. Declarations only; the
 * definitions are not in this file, so ownership of the returned char*
 * buffers cannot be determined here.
 */
double swish_time_elapsed(void);
double swish_time_cpu(void);
char  *swish_time_print(double time);
char  *swish_time_print_fine(double time);
char  *swish_time_format(time_t epoch);
/*
Error Functions */ void swish_set_error_handle( FILE *where ); void swish_croak(const char *file, int line, const char *func, const char *msg,...); void swish_warn(const char *file, int line, const char *func, const char *msg,...); void swish_debug(const char *file, int line, const char *func, const char *msg,...); const char* swish_err_msg(int err_code); /*
String Functions */ char * swish_get_locale(); void swish_verify_utf8_locale(); boolean swish_is_ascii( xmlChar *str ); int swish_bytes_in_wchar( int wchar ); int swish_utf8_chr_len( xmlChar *utf8 ); uint32_t swish_utf8_codepoint( xmlChar *utf8 ); int swish_utf8_num_chrs( xmlChar *utf8 ); void swish_utf8_next_chr( xmlChar *s, int *i ); void swish_utf8_prev_chr( xmlChar *s, int *i ); xmlChar * swish_str_escape_utf8( xmlChar *utf8 ); xmlChar * swish_str_unescape_utf8( xmlChar *ascii ); wchar_t * swish_locale_to_wchar(xmlChar * str); xmlChar * swish_wchar_to_locale(wchar_t * str); wchar_t * swish_wstr_tolower(wchar_t *s); xmlChar * swish_str_tolower(xmlChar *s ); xmlChar * swish_utf8_str_tolower(xmlChar *s); xmlChar * swish_ascii_str_tolower(xmlChar *s); xmlChar * swish_str_skip_ws(xmlChar *s); void swish_str_trim_ws(xmlChar *string); void swish_str_ctrl_to_ws(xmlChar *s); boolean swish_str_all_ws(xmlChar * s); boolean swish_str_all_ws_len(xmlChar * s, int len); void swish_debug_wchars( const wchar_t * widechars ); int swish_wchar_t_comp(const void *s1, const void *s2); int swish_sort_wchar(wchar_t *s); swish_StringList * swish_stringlist_build(xmlChar *line); swish_StringList * swish_stringlist_init(); void swish_stringlist_free(swish_StringList *sl); unsigned int swish_stringlist_add_string(swish_StringList *sl, xmlChar *str); void swish_stringlist_merge(swish_StringList *sl1, swish_StringList *sl2); swish_StringList * swish_stringlist_copy(swish_StringList *sl); swish_StringList * swish_stringlist_parse_sort_string(xmlChar *sort_string, swish_Config *cfg); void swish_stringlist_debug(swish_StringList *sl); int swish_string_to_int( char *buf ); boolean swish_string_to_boolean( char *buf ); xmlChar * swish_int_to_string( int val ); xmlChar * swish_long_to_string( long val ); xmlChar * swish_double_to_string( double val ); xmlChar * swish_date_to_string( int y, int m, int d ); char swish_get_C_escaped_char(xmlChar *s, xmlChar **se); /*
Configuration Functions */ swish_Config * swish_config_init(); void swish_config_set_default( swish_Config *config ); void swish_config_merge( swish_Config *config1, swish_Config *config2 ); swish_Config * swish_config_add( swish_Config * config, xmlChar * conf ); swish_Config * swish_config_parse( swish_Config * config, xmlChar * conf ); void swish_config_debug( swish_Config * config ); void swish_config_free( swish_Config * config); xmlHashTablePtr swish_mime_defaults(); xmlChar * swish_mime_get_type( swish_Config * config, xmlChar * fileext ); xmlChar * swish_mime_get_parser( swish_Config * config, xmlChar *mime ); void swish_config_test_alias_fors( swish_Config *c ); swish_ConfigFlags * swish_config_flags_init(); void swish_config_flags_debug( swish_ConfigFlags *flags ); void swish_config_flags_free( swish_ConfigFlags *flags ); void swish_config_test_alias_fors( swish_Config *config ); void swish_config_test_unique_ids( swish_Config *config );
/*
Parser Functions */
/*
 * swish_Parser construction and destruction. The init function takes the
 * handler callback, which receives a swish_ParserData pointer.
 * Declarations only; the definitions are not in this file.
 */
swish_Parser *swish_parser_init(void (*handler) (swish_ParserData *));
void          swish_parser_free(swish_Parser *parser);
/*
Token Functions */ swish_TokenList * swish_token_list_init(); void swish_token_list_free( swish_TokenList *tl ); int swish_token_list_add_token( swish_TokenList *tl, xmlChar *token, int token_len, swish_MetaName *meta, xmlChar *context ); int swish_token_list_set_token( swish_TokenList *tl, xmlChar *token, int len ); swish_Token * swish_token_init(); void swish_token_free( swish_Token *t ); swish_TokenIterator *swish_token_iterator_init( swish_Analyzer *a ); void swish_token_iterator_free( swish_TokenIterator *ti ); swish_Token * swish_token_iterator_next_token( swish_TokenIterator *it ); int swish_tokenize( swish_TokenIterator *ti, xmlChar *buf, swish_MetaName *meta, xmlChar *context ); int swish_tokenize_ascii( swish_TokenIterator *ti, xmlChar *buf, swish_MetaName *meta, xmlChar *context ); int swish_tokenize_utf8( swish_TokenIterator *ti, xmlChar *buf, swish_MetaName *meta, xmlChar *context ); void swish_token_list_debug( swish_TokenIterator *it ); xmlChar * swish_token_list_get_token_value( swish_TokenList *tl, swish_Token *t ); void swish_token_debug( swish_Token *t );
/*