Created
August 26, 2024 09:12
-
-
Save LukasKriesch/e75a0132e93ca989f8870c4f95be734b to your computer and use it in GitHub Desktop.
Revisions
-
LukasKriesch created this gist
Aug 26, 2024. There are no files selected for viewing.
# Regex-based text chunker, compared against the Jina tokenizer API.
#
# The script downloads a sample text, splits it into chunks with one large
# alternation regex (built with the third-party ``regex`` module, which
# supports ``\p{...}`` Unicode properties), prints the number of chunks, then
# sends the same text to https://tokenize.jina.ai/ and prints the number of
# chunks that service returns.

import regex as re
import requests

# Upper bounds used as regex quantifier limits, so no alternative can run
# unbounded.
MAX_HEADING_LENGTH = 7
MAX_HEADING_CONTENT_LENGTH = 200
MAX_HEADING_UNDERLINE_LENGTH = 200
MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100
MAX_LIST_ITEM_LENGTH = 200
MAX_NESTED_LIST_ITEMS = 6
MAX_LIST_INDENT_SPACES = 7
MAX_BLOCKQUOTE_LINE_LENGTH = 200
MAX_BLOCKQUOTE_LINES = 15
MAX_CODE_BLOCK_LENGTH = 1500
MAX_CODE_LANGUAGE_LENGTH = 20
MAX_INDENTED_CODE_LINES = 20
MAX_TABLE_CELL_LENGTH = 200
MAX_TABLE_ROWS = 20
MAX_HTML_TABLE_LENGTH = 2000
MIN_HORIZONTAL_RULE_LENGTH = 3
MAX_SENTENCE_LENGTH = 400
MAX_QUOTED_TEXT_LENGTH = 300
MAX_PARENTHETICAL_CONTENT_LENGTH = 200
MAX_NESTED_PARENTHESES = 5
MAX_MATH_INLINE_LENGTH = 100
MAX_MATH_BLOCK_LENGTH = 500
MAX_PARAGRAPH_LENGTH = 1000
MAX_STANDALONE_LINE_LENGTH = 800
MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
MAX_HTML_TAG_CONTENT_LENGTH = 1000
LOOKAHEAD_RANGE = 100

# Seconds to wait on each HTTP call before giving up instead of hanging.
REQUEST_TIMEOUT_SECONDS = 60

# Sentence terminator shared by several alternatives. This is a plain raw
# string (not an f-string), so the braces of \p{...} are written once. The
# two emoji properties form ONE character class: either kind of character
# may end a sentence.
SENTENCE_END = (
    r"(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]"
    r"|[\p{Emoji_Presentation}\p{Extended_Pictographic}])"
)

# Step 1: Download the file
url = "https://gist.github.com/phillipj/4944029/raw/75ba2243dd5ec2875f629bf5d79f6c1e4b5a8b46/alice_in_wonderland.txt"
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
if response.status_code != 200:
    raise RuntimeError("Failed to download the file.")
test_text = response.text

# Step 2: Build the regex
# Every entry is one alternative; they are tried in this order. Inside the
# rf-strings a literal regex brace must be doubled ({{ / }}) and only
# {NAME} interpolates a constant.
CHUNK_ALTERNATIVES = [
    # 1. Headings (Setext-style, Markdown, and HTML-style)
    (
        rf"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}"
        rf"|\w[^\r\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\r?\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}"
        rf"|<h[1-6][^>]{{0,{MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)"
        rf"[^\r\n]{{1,{MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\r?\n|$))"
    ),
    # 2. Citations
    rf"(?:\[[0-9]+\][^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}})",
    # 3. List items (Adjusted to handle indentation correctly)
    (
        rf"(?:(?:^|\r?\n)[ \t]{{0,3}}(?:[-*+•]|\d{{1,3}}\.\w\.|\[[ xX]\])[ \t]+"
        rf"(?:[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}})"
        rf"(?:\r?\n[ \t]{{2,}}(?:[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}))*)"
    ),
    # 4. Block quotes (Handles nested quotes without chunking)
    # \s (not \\s): in a raw string a doubled backslash would match a
    # literal backslash followed by "s".
    (
        rf"(?:(?:^>(?:>|\s{{2,}}){{0,2}}(?:[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}})"
        rf"(?:\r?\n[ \t]+[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}})*?\r?\n?))"
    ),
    # 5. Code blocks: fenced, indented, and <pre><code>
    (
        rf"(?:(?:^|\r?\n)(?:```|~~~)(?:\w{{0,{MAX_CODE_LANGUAGE_LENGTH}}})?\r?\n"
        rf"[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:```|~~~)\r?\n?)"
    ),
    (
        rf"(?:(?:^|\r?\n)(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}"
        rf"(?:\r?\n(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}){{0,{MAX_INDENTED_CODE_LINES}}}\r?\n?)"
    ),
    rf"(?:<pre>(?:<code>)[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:</code>)?</pre>)",
    # 6. Tables: pipe tables and HTML tables
    (
        rf"(?:(?:^|\r?\n)\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|"
        rf"(?:\r?\n\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\|)?"
        rf"(?:\r?\n\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|){{0,{MAX_TABLE_ROWS}}})"
    ),
    rf"<table>[\s\S]{{0,{MAX_HTML_TABLE_LENGTH}}}?</table>",
    # 7. Horizontal rules
    rf"(?:^(?:[-*_]){{{MIN_HORIZONTAL_RULE_LENGTH},}}\s*$|<hr\s*/?>)",
    # 8. Standalone lines or phrases (Prevent chunking by treating indented
    #    lines as part of the same block)
    (
        rf"(?:^(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}>"
        rf"[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}{SENTENCE_END}?"
        rf"(?:</[a-zA-Z]+>)?(?:\r?\n|$))"
        rf"(?:\r?\n[ \t]+[^\r\n]*)*)"
    ),
    # 9. Sentences (Allow sentences to include multiple lines if they are
    #    indented)
    (
        rf"(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}{SENTENCE_END}?(?=\s|$)"
        rf"(?:\r?\n[ \t]+[^\r\n]*)*)"
    ),
    # 10. Quoted text, parentheticals, or bracketed content
    rf"(?<!\w)\"\"\"[^\"]{{0,{MAX_QUOTED_TEXT_LENGTH}}}\"\"\"(?!\w)",
    # The opening quote is captured by name so the closing quote must be the
    # same character.
    rf"(?<!\w)(?P<quote>['\"\`])[^\r\n]{{0,{MAX_QUOTED_TEXT_LENGTH}}}(?P=quote)(?!\w)",
    (
        rf"\([^\r\n()]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}"
        rf"(?:\([^\r\n()]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\)"
        rf"[^\r\n()]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\)"
    ),
    (
        rf"\[[^\r\n\[\]]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}"
        rf"(?:\[[^\r\n\[\]]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\]"
        rf"[^\r\n\[\]]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\]"
    ),
    rf"\$[^\r\n$]{{0,{MAX_MATH_INLINE_LENGTH}}}\$",
    rf"`[^\r\n`]{{0,{MAX_MATH_INLINE_LENGTH}}}`",
    # 11. Paragraphs (Treats indented lines as part of the same paragraph)
    (
        rf"(?:(?:^|\r?\n\r?\n)(?:<p>)?(?:"
        rf"(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}{SENTENCE_END}?(?=\s|$))"
        rf"|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[\r\n]|$))"
        rf"|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?={SENTENCE_END})"
        rf"(?:.{{1,{LOOKAHEAD_RANGE}}}{SENTENCE_END}(?=\s|$))?)"
        rf")(?:</p>)?(?:\r?\n[ \t]+[^\r\n]*)*)"
    ),
    # 12. HTML-like tags and their content
    (
        rf"(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}"
        rf"(?:>[\s\S]{{0,{MAX_HTML_TAG_CONTENT_LENGTH}}}</[a-zA-Z]+>|\s*/>))"
    ),
    # 13. LaTeX-style math expressions
    (
        rf"(?:(?:\$\$[\s\S]{{0,{MAX_MATH_BLOCK_LENGTH}}}?\$\$)"
        rf"|(?:\$[^\$\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}\$))"
    ),
    # 14. Fallback for any remaining content (Keep content together if it's
    #     indented)
    (
        rf"(?:"
        rf"(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}{SENTENCE_END}?(?=\s|$))"
        rf"|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))"
        rf"|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?={SENTENCE_END})"
        rf"(?:.{{1,{LOOKAHEAD_RANGE}}}{SENTENCE_END}(?=\s|$))"
        rf"(?:\r?\n[ \t]+[^\r\n]*)?)"
        rf")"
    ),
]

# The outer capturing group is kept so group 1 still holds the whole chunk.
chunk_regex = re.compile(
    "(" + "|".join(CHUNK_ALTERNATIVES) + ")",
    re.MULTILINE | re.UNICODE,
)

# Step 3: Apply the regex and print the matches
# finditer + group(0) yields the full matched text regardless of how many
# capturing groups the pattern has; findall would return strings or tuples
# depending on the group count.
matches = [match.group(0) for match in chunk_regex.finditer(test_text)]
print(f"Number of chunks: {len(matches)}")

# Step 4: Ask the Jina tokenizer API to chunk the same text, for comparison
url = 'https://tokenize.jina.ai/'
headers = {
    'Content-Type': 'application/json',
}
# NOTE(review): 'return_chunks' is sent as the string 'true', not a JSON
# boolean - confirm against the API documentation that this is accepted.
data = {
    'content': test_text,
    'return_chunks': 'true',
}
response = requests.post(
    url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
)
# Fail with a clear HTTP error instead of a KeyError/JSON error further down.
response.raise_for_status()
chunks = response.json()["chunks"]
print(f"Number of chunks: {len(chunks)}")