Skip to content

Instantly share code, notes, and snippets.

@codingdudecom
Created November 1, 2023 06:23
Show Gist options
  • Save codingdudecom/9bc1b19ac82c556331da0c4f2efc7885 to your computer and use it in GitHub Desktop.
Save codingdudecom/9bc1b19ac82c556331da0c4f2efc7885 to your computer and use it in GitHub Desktop.

Revisions

  1. codingdudecom created this gist Nov 1, 2023.
    74 changes: 74 additions & 0 deletions nlp.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,74 @@
    from js import fetch
    import nltk
    from nltk.util import ngrams
    from pathlib import Path
    import os, sys, io, zipfile

    # English stop words, kept as a list of lowercase strings. The words are
    # stored as one comma-separated literal and split at import time.
    stopwords = "i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now".split(",")

    # Set to True once the punkt tokenizer archive has been fetched and
    # unpacked, so the download happens at most once per module lifetime.
    punkt_downloaded = False
    async def download_punkt():
        """Fetch and unpack the NLTK ``punkt`` tokenizer data, at most once.

        The archive is downloaded with the JavaScript ``fetch`` function,
        saved as ``/nltk_data/tokenizers/punkt.zip`` and extracted into
        ``/nltk_data/tokenizers/``. The module-level ``punkt_downloaded`` flag
        is set to True only after extraction finishes, so later calls return
        immediately and a failed attempt is retried on the next call.

        Raises:
            zipfile.BadZipFile: If the downloaded bytes are not a valid zip
                archive.
        """
        global punkt_downloaded
        if punkt_downloaded:
            return

        # NOTE(review): the HTTP status is not inspected here; presumably a
        # non-2xx response body would be saved as punkt.zip and only surface
        # as BadZipFile below - confirm whether an explicit check is wanted.
        response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
        js_buffer = await response.arrayBuffer()
        # to_py() yields a memoryview; tobytes() copies it into a bytes object.
        archive_bytes = js_buffer.to_py().tobytes()

        tokenizers_dir = Path("/nltk_data/tokenizers")
        tokenizers_dir.mkdir(parents=True, exist_ok=True)

        archive_path = tokenizers_dir / "punkt.zip"
        archive_path.write_bytes(archive_bytes)

        # The context manager closes the archive handle even when extraction
        # fails; the previous code never closed it.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(path=tokenizers_dir)

        punkt_downloaded = True


    async def extract_keywords(text):
        """Return the most frequent 2-, 3- and 4-word phrases in ``text``.

        The text is tokenized with ``nltk.word_tokenize``. Tokens that are not
        purely alphanumeric are dropped, then tokens whose lowercase form is in
        the module-level ``stopwords`` list are dropped. N-grams are built over
        the remaining words, so a phrase can join words that were separated by
        a removed token in the original text.

        Args:
            text: The text to analyse.

        Returns:
            A list of ``[phrase, count]`` pairs: up to 10 most common bigrams,
            then up to 10 trigrams, then up to 10 four-grams. Each phrase is
            its words joined by single spaces.
        """
        # Make sure the punkt data is on disk; the helper is a no-op after the
        # first successful download.
        await download_punkt()

        # Build the lookup set once instead of scanning the list per token.
        stop_set = set(stopwords)
        keywords = [
            token
            for token in nltk.word_tokenize(text)
            if token.isalnum() and token.lower() not in stop_set
        ]

        # Collect the top phrases for each n-gram size, smallest size first.
        formatted_data = []
        for size in (2, 3, 4):
            freq_dist = nltk.FreqDist(ngrams(keywords, size))
            formatted_data.extend(
                [" ".join(phrase), count]
                for phrase, count in freq_dist.most_common(10)
            )

        return formatted_data