Skip to content

Instantly share code, notes, and snippets.

@chenkovsky
Last active December 16, 2024 07:52
Show Gist options
  • Save chenkovsky/40a5e6078fe7857ba3a2c1b2c624b4b6 to your computer and use it in GitHub Desktop.
Save chenkovsky/40a5e6078fe7857ba3a2c1b2c624b4b6 to your computer and use it in GitHub Desktop.
#ifndef __LANCE_TOKENIZER_H__
#define __LANCE_TOKENIZER_H__ 1
#include <stddef.h>
#include <stdint.h>
// Opaque handles. The pointed-to structs are not defined in this header, so
// code including it can store and pass these pointers but cannot look inside.
typedef struct LanceTokenizerFactoryT *LanceTokenizerFactory; // tokenizer factory handle
typedef struct LanceTokenizerT *LanceTokenizer;               // tokenizer handle
typedef struct LanceTokenStreamT *LanceTokenStream;           // token stream handle
/// A single token reported by a token stream.
struct LanceTokenT {
    /// Offset (byte index) at which the token starts in the source text.
    size_t offset_from;
    /// Offset (byte index) one past the last character of the token.
    /// The text that produced the token is therefore
    /// &text[token.offset_from..token.offset_to].
    size_t offset_to;
    /// Position of the token, counted in tokens.
    size_t position;
    /// Actual text content of the token.
    /// NOTE(review): no length accompanies this pointer, so it is presumably
    /// NUL-terminated; ownership and lifetime are not specified here - confirm.
    const char *text;
    /// Length of the token, expressed in number of original tokens.
    size_t position_length;
};
/// Pointer to a token record.
typedef struct LanceTokenT *LanceToken;
// Table of callbacks through which lance drives a tokenizer plugin.
// NOTE(review): the success value of the int-returning create_* callbacks is
// not stated in this header; presumably 0 means success and any other value
// is an error code that can be passed to strerror - confirm.
struct LanceTokenizerPluginT {
    // Called by lance when it loads the dynamic library. Takes an argc/argv
    // argument list and a pointer to a factory handle (presumably the slot
    // that receives the newly created factory - confirm).
    int (*create_factory)(int argc, const char *const *argv, LanceTokenizerFactory *factory_ptr);
    // Called by lance when it unloads the dynamic library. Takes a pointer to
    // the factory handle to destroy.
    void (*destroy_factory)(LanceTokenizerFactory *factory_ptr);
    // Creates a tokenizer from a factory and an argc/argv argument list;
    // tokenizer_ptr points at a tokenizer handle (presumably the output slot).
    int (*create_tokenizer)(LanceTokenizerFactory factory, int argc, const char *const *argv, LanceTokenizer *tokenizer_ptr);
    // Destroys a tokenizer; takes a pointer to its handle.
    void (*destroy_tokenizer)(LanceTokenizer *tokenizer_ptr);
    // Creates a token stream for `text`, given a tokenizer handle (passed by
    // value); stream_ptr points at a stream handle (presumably the output slot).
    // NOTE(review): `length` is presumably the byte length of `text` - confirm.
    int (*create_stream)(LanceTokenizer tokenizer, const char *text, size_t length, LanceTokenStream *stream_ptr);
    // Destroys a token stream; takes a pointer to its handle.
    void (*destroy_stream)(LanceTokenStream *stream_ptr);
    // Returns 1 and updates *token if there is a token; otherwise returns 0.
    int (*iter_token)(LanceTokenStream stream, LanceToken token);
    // Takes an int error number and returns a string.
    // NOTE(review): presumably describes error codes returned by the
    // callbacks above; the returned string's lifetime is unspecified - confirm.
    const char *(*strerror)(int errnum);
};
// Pointer to a plugin callback table.
typedef struct LanceTokenizerPluginT *LanceTokenizerPlugin;
// Signature of a plugin's initialization function: it is given a plugin
// callback table and returns 0 if initialization succeeded.
typedef int (*plugin_metadata)(LanceTokenizerPlugin plugin);
#endif
@chenkovsky
Copy link
Author

Users can configure the plugin with YAML:

plugins:
   lindera:
       library:    path_to_so
       metadata: plugin_metadata_func_name
       arguments:
       - extra arguments to factory

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment