Last active
December 16, 2024 07:52
-
-
Save chenkovsky/40a5e6078fe7857ba3a2c1b2c624b4b6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef LANCE_TOKENIZER_H
#define LANCE_TOKENIZER_H 1

#include <stddef.h>
#include <stdint.h>

/// Opaque handles exchanged between lance and a tokenizer plugin. The
/// pointed-to struct types are only named here, never defined, so this header
/// does not depend on their contents.
typedef struct LanceTokenizerFactoryT *LanceTokenizerFactory;
typedef struct LanceTokenizerT *LanceTokenizer;
typedef struct LanceTokenStreamT *LanceTokenStream;

/// One token, as filled in by `iter_token`.
struct LanceTokenT {
    /// Offset (byte index) of the first character of the token, i.e. the
    /// start of the range &text[token.offset_from..token.offset_to].
    size_t offset_from;
    /// Offset (byte index) of the last character of the token + 1.
    /// The text that generated the token should be obtained by
    /// &text[token.offset_from..token.offset_to]
    size_t offset_to;
    /// Position, expressed in number of tokens.
    size_t position;
    /// Actual text content of the token.
    /// NOTE(review): ownership, lifetime and NUL-termination of this buffer
    /// are not specified in this header -- confirm with the plugin contract.
    const char *text;
    /// Length of the token, expressed in number of original tokens.
    size_t position_length;
};
typedef struct LanceTokenT *LanceToken;

/// Table of callbacks through which lance drives a tokenizer plugin.
///
/// NOTE(review): apart from `iter_token`, the meaning of the `int` return
/// values is not stated here; presumably 0 means success and any other value
/// is an error number accepted by `strerror` -- confirm.
struct LanceTokenizerPluginT {
    /// Called by lance when the dynamic library is loaded. The new factory is
    /// handed back through `factory_ptr`.
    int (*create_factory)(int argc, const char *const *argv,
                          LanceTokenizerFactory *factory_ptr);
    /// Called by lance when the dynamic library is unloaded.
    void (*destroy_factory)(LanceTokenizerFactory *factory_ptr);
    /// Creates a tokenizer from `factory`; the result is handed back through
    /// `tokenizer_ptr`.
    int (*create_tokenizer)(LanceTokenizerFactory factory, int argc,
                            const char *const *argv,
                            LanceTokenizer *tokenizer_ptr);
    /// Destroys a tokenizer.
    void (*destroy_tokenizer)(LanceTokenizer *tokenizer_ptr);
    /// Creates a token stream over `length` bytes of `text`; the result is
    /// handed back through `stream_ptr`.
    ///
    /// The tokenizer handle is passed by value and therefore carries no
    /// top-level `const`: such a qualifier would apply to the pointer copy,
    /// not to the tokenizer, and is not part of the function type.
    int (*create_stream)(LanceTokenizer tokenizer, const char *text,
                         size_t length, LanceTokenStream *stream_ptr);
    /// Destroys a token stream.
    void (*destroy_stream)(LanceTokenStream *stream_ptr);
    /// Returns 1 and updates `token` if there is a token; otherwise
    /// returns 0.
    int (*iter_token)(LanceTokenStream stream, LanceToken token);
    /// NOTE(review): undocumented in the original; presumably maps an error
    /// number returned by the callbacks above to a readable message.
    const char *(*strerror)(int errnum);
};
typedef struct LanceTokenizerPluginT *LanceTokenizerPlugin;

/// Signature of the plugin's metadata/initialisation entry point.
/// Returns 0 if initialisation succeeded.
/// NOTE(review): presumably the callee fills in `plugin` -- confirm.
typedef int (*plugin_metadata)(LanceTokenizerPlugin plugin);

#endif /* LANCE_TOKENIZER_H */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Users can configure the plugin with YAML.