Last active
December 16, 2024 07:52
-
-
Save chenkovsky/40a5e6078fe7857ba3a2c1b2c624b4b6 to your computer and use it in GitHub Desktop.
Revisions
-
chenkovsky revised this gist
Dec 16, 2024 . 1 changed file with 0 additions and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -26,8 +26,6 @@ struct LanceTokenT { typedef struct LanceTokenT* LanceToken; struct LanceTokenizerPluginT { // lance will call this when load dynamic library -
chenkovsky created this gist
Dec 15, 2024 . There are no files selected for viewing
/*
 * C plugin interface for Lance tokenizers.
 *
 * A plugin is a dynamic library that fills in a LanceTokenizerPluginT table
 * of callbacks. All handle types are opaque to the caller; their layout is
 * private to the plugin.
 */
#ifndef __LANCE_TOKENIZER_H__
#define __LANCE_TOKENIZER_H__ 1

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handles owned by the plugin. */
typedef struct LanceTokenizerFactoryT *LanceTokenizerFactory;
typedef struct LanceTokenizerT *LanceTokenizer;
typedef struct LanceTokenStreamT *LanceTokenStream;

/** A single token produced by a token stream. */
struct LanceTokenT {
    /** Offset (byte index) of the first character of the token. */
    size_t offset_from;
    /**
     * Offset (byte index) of the last character of the token + 1.
     * The text that generated the token can be obtained as
     * &text[token.offset_from..token.offset_to].
     */
    size_t offset_to;
    /** Position, expressed in number of tokens. */
    size_t position;
    /** Actual text content of the token. */
    const char *text;
    /** Length of the token, expressed in number of original tokens. */
    size_t position_length;
};

typedef struct LanceTokenT *LanceToken;

/* Opaque error handle; not referenced by any callback in this header. */
typedef struct LanceTokenizerErrorT *LanceTokenizerError;

/**
 * Table of callbacks implemented by a tokenizer plugin.
 *
 * NOTE(review): the int-returning create_* callbacks presumably return 0 on
 * success and otherwise an error number accepted by `strerror` -- confirm
 * against the loader implementation.
 */
struct LanceTokenizerPluginT {
    /** Lance calls this when it loads the dynamic library. */
    int (*create_factory)(int argc, const char *const *argv,
                          LanceTokenizerFactory *factory_ptr);

    /** Lance calls this when it unloads the dynamic library. */
    void (*destroy_factory)(LanceTokenizerFactory *factory_ptr);

    /** Create a tokenizer from a factory. */
    int (*create_tokenizer)(LanceTokenizerFactory factory,
                            int argc, const char *const *argv,
                            LanceTokenizer *tokenizer_ptr);

    /** Destroy a tokenizer. */
    void (*destroy_tokenizer)(LanceTokenizer *tokenizer_ptr);

    /**
     * Create a token stream over `length` bytes of `text`.
     *
     * The handle is taken by value; a top-level `const` on it would not be
     * part of the function type, so it is omitted here.
     */
    int (*create_stream)(LanceTokenizer tokenizer,
                         const char *text, size_t length,
                         LanceTokenStream *stream_ptr);

    /** Destroy a token stream. */
    void (*destroy_stream)(LanceTokenStream *stream_ptr);

    /** If there is a token, update `token` and return 1; otherwise return 0. */
    int (*iter_token)(LanceTokenStream stream, LanceToken token);

    /**
     * Return a message for `errnum`.
     * NOTE(review): presumably `errnum` is a value returned by one of the
     * int-returning callbacks above -- confirm.
     */
    const char *(*strerror)(int errnum);
};

typedef struct LanceTokenizerPluginT *LanceTokenizerPlugin;

/** Plugin entry point type: fills in `plugin`; returns 0 if init succeeded. */
typedef int (*plugin_metadata)(LanceTokenizerPlugin plugin);

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* __LANCE_TOKENIZER_H__ */