Skip to content

Instantly share code, notes, and snippets.

@chenkovsky
Last active December 16, 2024 07:52
Show Gist options
  • Save chenkovsky/40a5e6078fe7857ba3a2c1b2c624b4b6 to your computer and use it in GitHub Desktop.
Save chenkovsky/40a5e6078fe7857ba3a2c1b2c624b4b6 to your computer and use it in GitHub Desktop.

Revisions

  1. chenkovsky revised this gist Dec 16, 2024. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions lance_tokenizer_plugin_draft.h
    Original file line number Diff line number Diff line change
    @@ -26,8 +26,6 @@ struct LanceTokenT {

    typedef struct LanceTokenT* LanceToken;

    typedef struct LanceTokenizerErrorT* LanceTokenizerError;

    struct LanceTokenizerPluginT {

    // lance will call this when load dynamic library
  2. chenkovsky created this gist Dec 15, 2024.
    62 changes: 62 additions & 0 deletions lance_tokenizer_plugin_draft.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,62 @@
    #ifndef __LANCE_TOKENIZER_H__
    #define __LANCE_TOKENIZER_H__ 1
    #include <stddef.h>
    #include <stdint.h>

    /// Handle to a tokenizer factory. struct LanceTokenizerFactoryT is only
    /// forward-declared here; its layout is not defined in this header.
    typedef struct LanceTokenizerFactoryT *LanceTokenizerFactory;
    /// Handle to a tokenizer. struct LanceTokenizerT is not defined in this header.
    typedef struct LanceTokenizerT *LanceTokenizer;
    /// Handle to a token stream. struct LanceTokenStreamT is not defined in this header.
    typedef struct LanceTokenStreamT *LanceTokenStream;

    /// One token: its byte span in the tokenized text, its position in the
    /// token sequence, and its text content.
    struct LanceTokenT {
        /// Offset (byte index) where the token starts; the lower bound of
        /// the half-open byte range [offset_from, offset_to).
        size_t offset_from;

        /// Offset (byte index) of the last character of the token + 1.
        /// The text that generated the token is the byte range
        /// [offset_from, offset_to) of the tokenized text.
        size_t offset_to;

        /// Position of the token, expressed in number of tokens.
        size_t position;

        /// Actual text content of the token.
        /// NOTE(review): ownership, lifetime and NUL-termination of this
        /// pointer are not stated in this header -- confirm before relying on them.
        const char *text;

        /// Length of the token, expressed in number of original tokens.
        size_t position_length;
    };

    /// Pointer to a token record (struct LanceTokenT).
    typedef struct LanceTokenT *LanceToken;

    /// Handle type for a tokenizer error.
    /// NOTE(review): struct LanceTokenizerErrorT is not defined in this header
    /// and no other declaration here uses this typedef -- confirm it is still needed.
    typedef struct LanceTokenizerErrorT *LanceTokenizerError;

    /// Table of callbacks through which lance drives a tokenizer plugin.
    ///
    /// Only iter_token states its return convention. NOTE(review): the other
    /// int-returning callbacks presumably return 0 on success and otherwise an
    /// error number that strerror can describe -- confirm with the host.
    struct LanceTokenizerPluginT {
        /// Called by lance when the dynamic library is loaded.
        /// argc/argv carry the arguments for the factory; factory_ptr is
        /// where the factory handle is expected to be stored (TODO confirm).
        int (*create_factory)(int argc, const char *const *argv,
                              LanceTokenizerFactory *factory_ptr);

        /// Called by lance when the dynamic library is unloaded.
        void (*destroy_factory)(LanceTokenizerFactory *factory_ptr);

        /// Creates a tokenizer from a factory, with argc/argv-style arguments.
        /// tokenizer_ptr is where the tokenizer handle is expected to be
        /// stored (TODO confirm).
        int (*create_tokenizer)(LanceTokenizerFactory factory, int argc,
                                const char *const *argv,
                                LanceTokenizer *tokenizer_ptr);

        /// Destroys a tokenizer.
        void (*destroy_tokenizer)(LanceTokenizer *tokenizer_ptr);

        /// Creates a token stream over `text`, which is passed together with
        /// an explicit `length`. stream_ptr is where the stream handle is
        /// expected to be stored (TODO confirm).
        ///
        /// `tokenizer` is a pointer handle passed by value. A top-level const
        /// on it would qualify only the local copy of the pointer, not the
        /// tokenizer it points to, and does not affect the function type, so
        /// none is written here.
        int (*create_stream)(LanceTokenizer tokenizer, const char *text,
                             size_t length, LanceTokenStream *stream_ptr);

        /// Destroys a token stream.
        void (*destroy_stream)(LanceTokenStream *stream_ptr);

        /// If there is a token, returns 1 and updates *token; otherwise
        /// returns 0.
        int (*iter_token)(LanceTokenStream stream, LanceToken token);

        /// Returns a string for the error number `errnum`.
        /// NOTE(review): lifetime of the returned string is not stated here.
        const char *(*strerror)(int errnum);
    };

    /// Pointer to a plugin callback table (struct LanceTokenizerPluginT).
    typedef struct LanceTokenizerPluginT *LanceTokenizerPlugin;

    /// Signature of the plugin initialisation function. It receives the
    /// callback table as `plugin` and returns 0 if init succeeded.
    typedef int (*plugin_metadata)(LanceTokenizerPlugin plugin);

    #endif