Skip to content

Instantly share code, notes, and snippets.

@beginor
Created August 15, 2024 07:34
Show Gist options
  • Save beginor/fc284c3fc14044cb1a52d3b1ad9d4cb1 to your computer and use it in GitHub Desktop.
Save beginor/fc284c3fc14044cb1a52d3b1ad9d4cb1 to your computer and use it in GitHub Desktop.

Revisions

  1. @hanxiao hanxiao created this gist Aug 14, 2024.
    85 changes: 85 additions & 0 deletions chunking-regex.ts
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,85 @@
    // Used in https://jina.ai/tokenizer (Aug. 14th version)
    // Define variables for magic numbers
    const MAX_HEADING_LENGTH = 6;
    const MAX_HEADING_CONTENT_LENGTH = 200;
    const MAX_HEADING_UNDERLINE_LENGTH = 200;
    const MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100;
    const MAX_LIST_ITEM_LENGTH = 200;
    const MAX_NESTED_LIST_ITEMS = 5;
    const MAX_LIST_INDENT_SPACES = 7;
    const MAX_BLOCKQUOTE_LINE_LENGTH = 200;
    const MAX_BLOCKQUOTE_LINES = 10;
    const MAX_CODE_BLOCK_LENGTH = 1000;
    const MAX_CODE_LANGUAGE_LENGTH = 20;
    const MAX_INDENTED_CODE_LINES = 20;
    const MAX_TABLE_CELL_LENGTH = 200;
    const MAX_TABLE_ROWS = 20;
    const MAX_HTML_TABLE_LENGTH = 2000;
    const MIN_HORIZONTAL_RULE_LENGTH = 3;
    const MAX_SENTENCE_LENGTH = 300;
    const MAX_QUOTED_TEXT_LENGTH = 300;
    const MAX_PARENTHETICAL_CONTENT_LENGTH = 200;
    const MAX_NESTED_PARENTHESES = 5;
    const MAX_MATH_INLINE_LENGTH = 100;
    const MAX_MATH_BLOCK_LENGTH = 500;
    const MAX_PARAGRAPH_LENGTH = 1000;
    const MAX_STANDALONE_LINE_LENGTH = 400;
    const MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100;
    const MAX_HTML_TAG_CONTENT_LENGTH = 1000;

    // Read the regex and test text from files
    export const chunkRegex = new RegExp(
    "(" +
    // 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)
    `(?:^(?:[#*=-]{1,${MAX_HEADING_LENGTH}}|\\w[^\\r\\n]{0,${MAX_HEADING_CONTENT_LENGTH}}\\r?\\n[-=]{2,${MAX_HEADING_UNDERLINE_LENGTH}}|<h[1-6][^>]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}>)[^\\r\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:</h[1-6]>)?(?:\\r?\\n|$))` +
    "|" +
    // 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints)
    `(?:(?:^|\\r?\\n)[ \\t]{0,3}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}` +
    `(?:(?:\\r?\\n[ \\t]{2,5}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_NESTED_LIST_ITEMS}}` +
    `(?:\\r?\\n[ \\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+•]|\\d{1,3}\\.\\w\\.|\\[[ xX]\\])[ \\t]+[^\\r\\n]{1,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_NESTED_LIST_ITEMS}})?)` +
    "|" +
    // 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints)
    `(?:(?:^>(?:>|\\s{2,}){0,2}[^\\r\\n]{0,${MAX_BLOCKQUOTE_LINE_LENGTH}}\\r?\\n?){1,${MAX_BLOCKQUOTE_LINES}})` +
    "|" +
    // 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints)
    `(?:(?:^|\\r?\\n)(?:\`\`\`|~~~)(?:\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\r?\\n[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\`\`\`|~~~)\\r?\\n?` +
    `|(?:(?:^|\\r?\\n)(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\r?\\n(?: {4}|\\t)[^\\r\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\r?\\n?)` +
    `|(?:<pre>(?:<code>)?[\\s\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:</code>)?</pre>))` +
    "|" +
    // 5. Tables (Markdown, grid tables, and HTML tables, with length constraints)
    `(?:(?:^|\\r?\\n)(?:\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|(?:\\r?\\n\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\|){0,1}(?:\\r?\\n\\|[^\\r\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\|){0,${MAX_TABLE_ROWS}}` +
    `|<table>[\\s\\S]{0,${MAX_HTML_TABLE_LENGTH}}?</table>))` +
    "|" +
    // 6. Horizontal rules (Markdown and HTML hr tag)
    `(?:^(?:[-*_]){${MIN_HORIZONTAL_RULE_LENGTH},}\\s*$|<hr\\s*/?>)` +
    "|" +
    // 10. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints)
    `(?:^(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}>)?[^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?:</[a-zA-Z]+>)?(?:\\r?\\n|$))` +
    "|" +
    // 7. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)
    `(?:[^\\r\\n]{1,${MAX_SENTENCE_LENGTH}}(?:[.!?…]|\\.{3}|[\\u2026\\u2047-\\u2049]|[\\p{Emoji_Presentation}\\p{Extended_Pictographic}])(?=(?=\\s)(?=\\s*[A-Z])|(?=\\r?\\n)|(?=\\Z)))` +
    "|" +
    // 8. Quoted text, parenthetical phrases, or bracketed content (with length constraints)
    "(?:" +
    `\"\"\"[^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}\"\"\"` +
    `|['\"\`'"][^\\r\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}['\"\`'"]` +
    `|\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\([^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\)[^\\r\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\)` +
    `|\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\[[^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\][^\\r\\n\\[\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\]` +
    `|\\$[^\\r\\n$]{0,${MAX_MATH_INLINE_LENGTH}}\\$` +
    `|\`[^\`\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\`` +
    ")" +
    "|" +
    // 9. Paragraphs (with length constraints)
    `(?:(?<=\\r?\\n\\r?\\n|\\A)(?:<p>)?(?:(?!\\r?\\n\\r?\\n|\\z).){1,${MAX_PARAGRAPH_LENGTH}}(?:</p>)?(?=\\r?\\n\\r?\\n|\\z))` +
    "|" +
    // 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints)
    `(?:<[a-zA-Z][^>]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}(?:>[\\s\\S]{0,${MAX_HTML_TAG_CONTENT_LENGTH}}?</[a-zA-Z]+>|\\s*/>))` +
    "|" +
    // 12. LaTeX-style math expressions (inline and block, with length constraints)
    `(?:(?:\\$\\$[\\s\\S]{0,${MAX_MATH_BLOCK_LENGTH}}?\\$\\$)|(?:\\$[^\\$\\r\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\$))` +
    "|" +
    // 14. Fallback for any remaining content (with length constraints)
    `(?:[^\\r\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` +
    ")",
    "gm"
    );