codingdudecom · November 1, 2023 06:23 · Nov 1, 2023
diff --git a/nlp.py b/nlp.py
@@ -0,0 +1,74 @@
+			from js import fetch
+			import nltk
+			from nltk.util import ngrams
+			from pathlib import Path
+			import os, sys, io, zipfile
+
+			stopwords = "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now"
+			stopwords = stopwords.split(",")
+
+			punkt_downloaded = False
+			async def download_punkt():
+				global punkt_downloaded
+				if not punkt_downloaded:
+					response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
+					js_buffer = await response.arrayBuffer()
+					py_buffer = js_buffer.to_py()  # this is a memoryview
+					stream = py_buffer.tobytes()  # now we have a bytes object
+
+					d = Path("/nltk_data/tokenizers")
+					d.mkdir(parents=True, exist_ok=True)
+
+					Path('/nltk_data/tokenizers/punkt.zip').write_bytes(stream)
+
+					# extract punkt.zip
+					zipfile.ZipFile('/nltk_data/tokenizers/punkt.zip').extractall(
+					    path='/nltk_data/tokenizers/'
+					)
+					punkt_downloaded = True
+
+
+			async def extract_keywords(text):
+				global punkt_downloaded
+				if not punkt_downloaded:
+					response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
+					js_buffer = await response.arrayBuffer()
+					py_buffer = js_buffer.to_py()  # this is a memoryview
+					stream = py_buffer.tobytes()  # now we have a bytes object
+
+					d = Path("/nltk_data/tokenizers")
+					d.mkdir(parents=True, exist_ok=True)
+
+					Path('/nltk_data/tokenizers/punkt.zip').write_bytes(stream)
+
+					# extract punkt.zip
+					zipfile.ZipFile('/nltk_data/tokenizers/punkt.zip').extractall(
+					    path='/nltk_data/tokenizers/'
+					)
+					punkt_downloaded = True
+
+				# check file contents in /nltk_data/tokenizers/
+				# print(os.listdir("/nltk_data/tokenizers/punkt"))
+
+				# return nltk.word_tokenize(text)
+				words = nltk.word_tokenize(text)
+				words = [word for word in words if word.isalnum()]
+				filtered_words = [word for word in words if word.lower() not in stopwords]
+
+				# Create bi-grams and tri-grams
+				bigrams = list(ngrams(filtered_words, 2))
+				trigrams = list(ngrams(filtered_words, 3))
+				quadgrams = list(ngrams(filtered_words, 4))
+
+				# Calculate frequency distributions for bi-grams and tri-grams
+				bigram_freq_dist = nltk.FreqDist(bigrams)
+				trigram_freq_dist = nltk.FreqDist(trigrams)
+				quadgram_freq_dist = nltk.FreqDist(quadgrams)
+
+				data = bigram_freq_dist.most_common(10) + trigram_freq_dist.most_common(10) + quadgram_freq_dist.most_common(10)
+
+				# Get the top N words
+				# top_keywords = [word for word, freq in word_freq.most_common(10)]
+				formatted_data = [[" ".join(keyword), count] for keyword, count in data]
+
+				return formatted_data