Created
November 1, 2023 06:23
-
-
Save codingdudecom/9bc1b19ac82c556331da0c4f2efc7885 to your computer and use it in GitHub Desktop.
Revisions
-
codingdudecom created this gist
Nov 1, 2023. There are no files selected for viewing.
# Keyword extraction for Pyodide (runs in the browser): fetches NLTK's punkt
# tokenizer data over the JS fetch() bridge, then extracts the most frequent
# 2-, 3- and 4-gram phrases from a text.
from js import fetch
import nltk
from nltk.util import ngrams
from pathlib import Path
import os, sys, io, zipfile

# Common English stopwords, excluded from keyword candidates.
stopwords = "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
# A set gives O(1) membership tests in the per-token filter below
# (the original kept a list, making each lookup O(n)).
stopwords = set(stopwords.split(","))

# Module-level flag: punkt data is downloaded at most once per session.
punkt_downloaded = False

# Where nltk looks for tokenizer data inside the Pyodide virtual FS.
NLTK_TOKENIZERS_DIR = "/nltk_data/tokenizers"


async def download_punkt():
    """Fetch and unpack NLTK's 'punkt' tokenizer data into /nltk_data.

    Uses the browser's fetch() via the Pyodide js bridge (plain urllib
    sockets are unavailable in the browser). Idempotent: the module-level
    ``punkt_downloaded`` flag makes repeat calls no-ops.
    """
    global punkt_downloaded
    if punkt_downloaded:
        return
    response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
    js_buffer = await response.arrayBuffer()
    py_buffer = js_buffer.to_py()  # JsProxy -> memoryview
    stream = py_buffer.tobytes()   # materialize as bytes for write_bytes()
    d = Path(NLTK_TOKENIZERS_DIR)
    d.mkdir(parents=True, exist_ok=True)
    zip_path = d / "punkt.zip"
    zip_path.write_bytes(stream)
    # Extract so nltk can find /nltk_data/tokenizers/punkt/*.
    # Context manager closes the archive (the original leaked the handle).
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(path=NLTK_TOKENIZERS_DIR)
    punkt_downloaded = True


async def extract_keywords(text):
    """Return the top multi-word keyword phrases of *text*.

    Tokenizes the text, keeps alphanumeric non-stopword tokens, and takes
    the 10 most frequent 2-grams, 3-grams and 4-grams (in that order).

    :param text: the input document as a string.
    :return: list of ``[phrase, count]`` pairs, up to 30 entries.
    """
    # The original duplicated the whole download routine inline here;
    # delegate to the shared (idempotent) helper instead.
    await download_punkt()

    words = nltk.word_tokenize(text)
    words = [word for word in words if word.isalnum()]
    filtered_words = [word for word in words if word.lower() not in stopwords]

    # Frequency distributions of 2-, 3- and 4-grams, concatenated in
    # n order to preserve the original output ordering.
    data = []
    for n in (2, 3, 4):
        freq_dist = nltk.FreqDist(ngrams(filtered_words, n))
        data += freq_dist.most_common(10)

    # (("multi", "word"), 3) -> ["multi word", 3]
    return [[" ".join(keyword), count] for keyword, count in data]