from js import fetch
import nltk
from nltk.util import ngrams
from pathlib import Path
import os, sys, io, zipfile

# Inline English stopword list, so no extra NLTK corpus download is needed
stopwords = "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
stopwords = stopwords.split(",")

punkt_downloaded = False


async def download_punkt():
    """Fetch the NLTK punkt tokenizer data and unpack it where NLTK looks for it."""
    global punkt_downloaded
    if not punkt_downloaded:
        response = await fetch(
            "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"
        )
        js_buffer = await response.arrayBuffer()
        py_buffer = js_buffer.to_py()  # this is a memoryview
        stream = py_buffer.tobytes()  # now we have a bytes object
        d = Path("/nltk_data/tokenizers")
        d.mkdir(parents=True, exist_ok=True)
        Path("/nltk_data/tokenizers/punkt.zip").write_bytes(stream)
        # Extract punkt.zip
        zipfile.ZipFile("/nltk_data/tokenizers/punkt.zip").extractall(
            path="/nltk_data/tokenizers/"
        )
        punkt_downloaded = True


async def extract_keywords(text):
    """Return the most frequent bi-, tri- and quad-grams in text as [phrase, count] pairs."""
    # Make sure the punkt tokenizer data is present (no-op after the first call)
    await download_punkt()

    words = nltk.word_tokenize(text)
    words = [word for word in words if word.isalnum()]
    filtered_words = [word for word in words if word.lower() not in stopwords]

    # Create bi-grams, tri-grams and quad-grams
    bigrams = list(ngrams(filtered_words, 2))
    trigrams = list(ngrams(filtered_words, 3))
    quadgrams = list(ngrams(filtered_words, 4))

    # Calculate frequency distributions for each n-gram size
    bigram_freq_dist = nltk.FreqDist(bigrams)
    trigram_freq_dist = nltk.FreqDist(trigrams)
    quadgram_freq_dist = nltk.FreqDist(quadgrams)

    # Keep the ten most common phrases of each length
    data = (
        bigram_freq_dist.most_common(10)
        + trigram_freq_dist.most_common(10)
        + quadgram_freq_dist.most_common(10)
    )

    # Join each n-gram tuple back into a space-separated phrase
    formatted_data = [[" ".join(keyword), count] for keyword, count in data]
    return formatted_data
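
# --- Usage sketch (an added illustration, not part of the original script) ---
# extract_keywords() is a coroutine and `from js import fetch` only resolves
# inside Pyodide, so this example assumes it runs somewhere top-level await is
# allowed, e.g. code passed to pyodide.runPythonAsync() from JavaScript.
sample_text = (
    "Pyodide runs Python in the browser. "
    "Running Python in the browser makes client-side NLP demos possible."
)
keywords = await extract_keywords(sample_text)
print(keywords)  # e.g. [["Python browser", 2], ["Pyodide runs", 1], ...]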