Skip to content

Instantly share code, notes, and snippets.

@rennokki
Last active October 10, 2023 07:52
Show Gist options
  • Save rennokki/bde0f4576eca82c20d04b107dd5c67b4 to your computer and use it in GitHub Desktop.
Save rennokki/bde0f4576eca82c20d04b107dd5c67b4 to your computer and use it in GitHub Desktop.

Revisions

  1. rennokki revised this gist Oct 10, 2023. 1 changed file with 10 additions and 6 deletions.
    16 changes: 10 additions & 6 deletions code.ts
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,4 @@
    // Create a small embedding; CloudflareWorkersAIEmbeddings is provided by LangChain
    function setupEmbeddings(env: Env): CloudflareWorkersAIEmbeddings {
    return new CloudflareWorkersAIEmbeddings({
    binding: env.AI as unknown as Fetcher,
    @@ -9,25 +10,28 @@ function setupEmbeddings(env: Env): CloudflareWorkersAIEmbeddings {
    });
    };

    // CloudflareVectorizeStore is provided by langchain
    function vectorStorage(env: Env): CloudflareVectorizeStore {
    return new CloudflareVectorizeStore(setupEmbeddings(env), {
    index: env.V1_VECTORIZE,
    index: env.V1_VECTORIZE, // index
    onFailedAttempt: (error) => {
    console.log('Failed attempt on vectorize', error);
    },
    });
    };


    // Just a Wikipedia article; I tried to find something small
    const content = 'https://en.wikipedia.org/wiki/Attiki,_Athens';

    // Get the Cheerio Web Loader and split docs
    const rawDocs = await new CheerioWebBaseLoader(content as string).load(); // tried loadAndSplit() too

    // HTML -> Docs splitter
    const splitter = RecursiveCharacterTextSplitter.fromLanguage('html');
    const sequence = splitter.pipe(new HtmlToTextTransformer());

    // Make the sequence pass through HtmlToTextTransformer() to get the final docs
    const documents = await sequence.invoke(rawDocs);

    // This triggers the above error.
    // This call triggers the error.
    await vectorStorage(env).addDocuments(documents);



  2. rennokki revised this gist Oct 10, 2023. 2 changed files with 3 additions and 2 deletions.
    2 changes: 2 additions & 0 deletions code.ts
    Original file line number Diff line number Diff line change
    @@ -18,6 +18,8 @@ function vectorStorage(env: Env): CloudflareVectorizeStore {
    });
    };


    const content = 'https://en.wikipedia.org/wiki/Attiki,_Athens';
    const rawDocs = await new CheerioWebBaseLoader(content as string).load(); // tried loadAndSplit() too
    const splitter = RecursiveCharacterTextSplitter.fromLanguage('html');
    const sequence = splitter.pipe(new HtmlToTextTransformer());
    3 changes: 1 addition & 2 deletions log
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,6 @@
    Script modified; context reset.
    Script modified; context reset.
    Array(23)0: Documentmetadata: contentHash: "bdb6bb8a1c8be21da7d69ec6772d5ae45a86957d621783f0ba6e1e6d20fcc1ef"loc: lines: {from: 1, to: 185}[[Prototype]]: ObjectoriginHash: "d4a39bfd6009e583ebde0b29f88490ac9916e4dd998a15d4a3c947e52ca7ea7e"source: "https://en.wikipedia.org/wiki/Attiki,_Athens"vectorId: undefined[[Prototype]]: ObjectpageContent: "Jump to content Main menu Main menu move to sidebar hide Navigation Main\npageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\nContribute HelpLearn to editCommunity portalRecent changesUpload file Languages\nLanguage links are at the top of the page across from the title. Search Search\nCreate accountLog in Personal tools Create account Log in Pages for logged out\neditors learn more ContributionsTalk Contents move to"[[Prototype]]: Object1: Document {pageContent: 'for logged out editors learn more ContributionsTal…s Tools move to sidebar\nhide Actions ReadEditView', metadata: {…}}2: Document {pageContent: 'history Tools Tools move to sidebar hide Actions R…rom Wikipedia, the free encyclopedia Neighborhood', metadata: {…}}3: Document {pageContent: '.latitude{white-space:nowrap}37°59′47″N 23°43′21″E…parser-output\n.ib-settlement td,.mw-parser-output', metadata: {…}}4: Document {pageContent: '.mw-parser-output .infobox-below{text-align:center….mw-parser-output\n.ib-settlement .mergedbottomrow', metadata: {…}}5: Document {pageContent: '.ib-settlement .mergedrow .infobox-label{border:0;…ent-caption{padding:0.3em 0 0 0}.mw-parser-output', metadata: {…}}6: Document {pageContent: '.ib-settlement-other-name{font-size:78%}.mw-parser…small{font-size:85%}.mw-parser-output .references', metadata: {…}}7: Document {pageContent: '23.72250CountryGreeceRegionAtticaCityAthensPostal …60s and 1970s. References[edit] .mw-parser-output', metadata: {…}}8: Document {pageContent: 'to its proximity to the Cephissus river. 
Nowadays,…eflist-lower-roman{list-style-type:lower-roman} ^', metadata: {…}}9: Document {pageContent: '.reflist-lower-alpha{list-style-type:lower-alpha}.…mw-parser-output .citation .cs1-lock-subscription', metadata: {…}}10: Document {pageContent: '0.1em center/9px no-repeat}.mw-parser-output .id-l…αν! Το περίφημο "Θηρίο" ένωνε Κηφισιά με Λαύριο".', metadata: {…}}11: Document {pageContent: '.citation .mw-selflink{font-weight:inherit}"Η Αττι…ight:1.5em;border-color:#fdfdfd}.mw-parser-output', metadata: {…}}metadata: {source: 'https://en.wikipedia.org/wiki/Attiki,_Athens', loc: {…}, contentHash: 'c039524faded387c8cdd1f9da3fd048fec0dd548a492b1e1a3d928f76e803145', originHash: 'd4a39bfd6009e583ebde0b29f88490ac9916e4dd998a15d4a3c947e52ca7ea7e', vectorId: undefined}pageContent: ".citation .mw-selflink{font-weight:inherit}\"Η Αττική είχε προαστιακό σιδηρόδρομο\nπριν από 129 χρόνια, αλλά τον ξήλωσαν! Το περίφημο \"Θηρίο\" ένωνε Κηφισιά με\nΛαύριο\". mixanitouxronou.gr. Retrieved 4 April 2015. .mw-parser-output\n.navbox{box-sizing:border-box;border:1px solid\n#a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em\nauto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output\n.navbox+.navbox,.mw-parser-output\n.navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output\n.navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output\n.navbox-group,.mw-parser-output .navbox-title,.mw-parser-output\n.navbox-abovebelow{padding:0.25em\n1em;line-height:1.5em;text-align:center}.mw-parser-output\n.navbox-group{white-space:nowrap;text-align:right}.mw-parser-output\n.navbox,.mw-parser-output\n.navbox-subgroup{background-color:#fdfdfd}.mw-parser-output\n.navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output"[[Prototype]]: Object12: Document {pageContent: '.navbox,.mw-parser-output\n.navbox-subgroup{backgro…ser-output .navbox .hlist td\nul,.mw-parser-output', metadata: {…}}13: Document {pageContent: 
'.navbox-odd{background-color:transparent}.mw-parse…r-output .hlist dl ol,.mw-parser-output .hlist dl', metadata: {…}}14: Document {pageContent: '.hlist.inline dl,.mw-parser-output .hlist.inline o…d:first-child::before,.mw-parser-output .hlist dt', metadata: {…}}15: Document {pageContent: '.hlist dd dt:first-child::before,.mw-parser-output…l{counter-reset:listitem}.mw-parser-output .hlist', metadata: {…}}16: Document {pageContent: '.hlist li dt:last-child::after,.mw-parser-output .…i{word-spacing:-0.125em}.mw-parser-output .navbar', metadata: {…}}17: Document {pageContent: '"}.mw-parser-output .navbar-brackets::after{margin… (Aerides,\nAnafiotika) Polygono Probonas Profitis', metadata: {…}}18: Document {pageContent: 'Filothei Neapoli Neos Kosmos Omonoia Pangrati (Kal…l:CentralAutoLogin/start?type=1x1] Retrieved from', metadata: {…}}19: Document {pageContent: 'by expanding it.vte\n[//en.wikipedia.org/wiki/Speci…t\nWikipedia Disclaimers Contact Wikipedia Code of', metadata: {…}}20: Document {pageContent: 'and Privacy Policy. 
Wikipedia® is a registered tra…% 221.619 1 Template:Infobox_settlement"," 25.57%', metadata: {…}}21: Documentmetadata: contentHash: "0ba800d1b576aaa425310398a004eafd096bcb4c5e934ccf9f734a4eb9993029"loc: {lines: {…}}originHash: "d4a39bfd6009e583ebde0b29f88490ac9916e4dd998a15d4a3c947e52ca7ea7e"source: "https://en.wikipedia.org/wiki/Attiki,_Athens"vectorId: undefined[[Prototype]]: ObjectpageContent: "486.669 1 -total\",\" 45.54% 221.619 1 Template:Infobox_settlement\",\" 25.57%\n124.464 1 Template:Infobox\",\" 20.72% 100.846 10 Template:Main_other\",\" 16.03%\n78.010 1 Template:Lang-el\",\" 15.26% 74.275 1 Template:Reflist\",\" 13.34% 64.922 1\nTemplate:Cite_web\",\" 13.31% 64.794 2 Template:Navbox\",\" 9.76% 47.499 1\nTemplate:Coord\",\" 8.15% 39.647 1\nTemplate:Athens\"]},\"scribunto\":{\"limitreport-timeusage\":{\"value\":\"0.270\",\"limit\":\"10.000\"},\"limitreport-memusage\":{\"value\":16258831,\"limit\":52428800}},\"cachereport\":{\"origin\":\"mw1479\",\"timestamp\":\"20231009133134\",\"ttl\":1814400,\"transientcontent\":false}}});});\n{\"@context\":\"https:\\/\\/schema.org\",\"@type\":\"Article\",\"name\":\"Attiki,\nAthens\",\"url\":\"https:\\/\\/en.wikipedia.org\\/wiki\\/Attiki,_Athens\",\"sameAs\":\"http:\\/\\/www.wikidata.org\\/entity\\/Q4818367\",\"mainEntity\":\"http:\\/\\/www.wikidata.org\\/entity\\/Q4818367\",\"author\":{\"@type\":\"Organization\",\"name\":\"Contributors\nto Wikimedia"[[Prototype]]: Object22: Document {pageContent: 'to Wikimedia projects"},"publisher":{"@type":"Orga…svg","headline":"neighborhood\nof Athens, Greece"}', metadata: {…}}length: 23[[Prototype]]: Array(0)
    Failed attempt on vectorize Error: VECTOR_UPSERT_ERROR (code = 4006): Bad Request: Request body JSON schema is invalid; [
    Failed attempt on vectorize Error: VECTOR_UPSERT_ERROR (code = 4006): Bad Request: Request body JSON schema is invalid; [
    {
    "code": "invalid_union",
    "unionErrors": [
  3. rennokki created this gist Oct 10, 2023.
    31 changes: 31 additions & 0 deletions code.ts
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,31 @@
    /**
     * Builds the embeddings client used when storing documents.
     *
     * @param env - Environment object; its `AI` property is passed as the client's binding.
     * @returns A `CloudflareWorkersAIEmbeddings` configured for the
     *          `@cf/baai/bge-small-en-v1.5` model with `stripNewLines` enabled.
     */
    function setupEmbeddings(env: Env): CloudflareWorkersAIEmbeddings {
      // Cast the AI binding to Fetcher via `unknown` before handing it over as `binding`.
      const aiBinding = env.AI as unknown as Fetcher;

      const embeddings = new CloudflareWorkersAIEmbeddings({
        binding: aiBinding,
        modelName: '@cf/baai/bge-small-en-v1.5',
        stripNewLines: true,
        // Log the error handed to the failure callback.
        onFailedAttempt(error) {
          console.log('Failed attempt on embed', error);
        },
      });

      return embeddings;
    }

    /**
     * Builds the vector store, wiring it to the embeddings from `setupEmbeddings(env)`.
     *
     * @param env - Environment object; its `V1_VECTORIZE` property is used as the store's index.
     * @returns A `CloudflareVectorizeStore` that logs the error passed to its failure callback.
     */
    function vectorStorage(env: Env): CloudflareVectorizeStore {
      const embeddings = setupEmbeddings(env);

      const store = new CloudflareVectorizeStore(embeddings, {
        index: env.V1_VECTORIZE,
        // Log the error handed to the failure callback.
        onFailedAttempt(error) {
          console.log('Failed attempt on vectorize', error);
        },
      });

      return store;
    }

    // Load the page at `content` with the Cheerio web loader.
    // NOTE(review): `content` is not declared in this snippet — presumably defined elsewhere; confirm.
    const rawDocs = await new CheerioWebBaseLoader(content as string).load(); // tried loadAndSplit() too
    // HTML splitter, piped into HtmlToTextTransformer to form a single sequence.
    const splitter = RecursiveCharacterTextSplitter.fromLanguage('html');
    const sequence = splitter.pipe(new HtmlToTextTransformer());

    // Run the loaded docs through the sequence to get the final documents.
    const documents = await sequence.invoke(rawDocs);

    // This call triggers the VECTOR_UPSERT_ERROR recorded in the gist's log file.
    await vectorStorage(env).addDocuments(documents);



    4,649 changes: 4,649 additions & 0 deletions log
    4,649 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.