// C interface between lance and a dynamically loaded tokenizer plugin.
//
// A plugin fills in a LanceTokenizerPluginT table of callbacks; lance then
// uses that table to create factories, tokenizers and token streams.
#ifndef __LANCE_TOKENIZER_H__
#define __LANCE_TOKENIZER_H__ 1

#include <stddef.h> // size_t
// NOTE(review): the original listed a second #include whose target is not
// recoverable from this view; restore it if dependents rely on it.

// Opaque handles. The pointed-to structs are defined by the plugin.
typedef struct LanceTokenizerFactoryT* LanceTokenizerFactory;
typedef struct LanceTokenizerT* LanceTokenizer;
typedef struct LanceTokenStreamT* LanceTokenStream;

/// A single token produced by a token stream.
struct LanceTokenT {
  /// Offset (byte index) where the token starts, i.e. the lower bound of
  /// &text[token.offset_from..token.offset_to].
  size_t offset_from;
  /// Offset (byte index) of the last character of the token + 1.
  /// The text that generated the token should be obtained by
  /// &text[token.offset_from..token.offset_to]
  size_t offset_to;
  /// Position, expressed in number of tokens.
  size_t position;
  /// Actual text content of the token.
  const char* text;
  /// Length of the token, expressed in number of original tokens.
  size_t position_length;
};
typedef struct LanceTokenT* LanceToken;

/// Callback table implemented by a tokenizer plugin.
///
/// NOTE(review): the int-returning create_* callbacks presumably return 0 on
/// success and an error number understood by `strerror` otherwise; this
/// header does not state it, so confirm against the plugin loader.
struct LanceTokenizerPluginT {
  // lance will call this when it loads the dynamic library.
  // The new factory handle is written through `factory_ptr`.
  int (*create_factory)(int argc, const char * const* argv,
                        LanceTokenizerFactory *factory_ptr);

  // lance will call this when it unloads the dynamic library.
  // Receives the address of the factory handle.
  void (*destroy_factory)(LanceTokenizerFactory *factory_ptr);

  // Create a tokenizer from `factory`; the handle is written through
  // `tokenizer_ptr`.
  int (*create_tokenizer)(LanceTokenizerFactory factory,
                          int argc, const char * const* argv,
                          LanceTokenizer *tokenizer_ptr);

  // Destroy a tokenizer. Receives the address of the tokenizer handle.
  void (*destroy_tokenizer)(LanceTokenizer *tokenizer_ptr);

  // Create a token stream over `text`; the handle is written through
  // `stream_ptr`.
  // NOTE(review): `length` is presumably the byte length of `text` - confirm.
  int (*create_stream)(const LanceTokenizer tokenizer,
                       const char* text, size_t length,
                       LanceTokenStream *stream_ptr);

  // Destroy a token stream. Receives the address of the stream handle.
  void (*destroy_stream)(LanceTokenStream *stream_ptr);

  // If there is a token, return 1 and update `token`; otherwise return 0.
  int (*iter_token)(LanceTokenStream stream, LanceToken token);

  // Map an error number to a message string.
  // NOTE(review): presumably takes the non-zero codes returned by the
  // callbacks above; ownership of the returned string is not specified here.
  const char* (*strerror)(int errnum);
};
typedef struct LanceTokenizerPluginT* LanceTokenizerPlugin;

// Plugin entry-point type: fills in `plugin`. Returns 0 if init succeeded.
typedef int(*plugin_metadata)(LanceTokenizerPlugin plugin);

#endif