Skip to content

Instantly share code, notes, and snippets.

@JNjenga
Last active January 8, 2022 12:19
Show Gist options
  • Select an option

  • Save JNjenga/b13f2faabe9102fe64a7bbe0da922117 to your computer and use it in GitHub Desktop.

Select an option

Save JNjenga/b13f2faabe9102fe64a7bbe0da922117 to your computer and use it in GitHub Desktop.

Revisions

  1. JNjenga revised this gist Jan 8, 2022. 1 changed file with 18 additions and 3 deletions.
    21 changes: 18 additions & 3 deletions diction.py
    Original file line number Diff line number Diff line change
    @@ -1,15 +1,20 @@
    from bs4 import BeautifulSoup
    from collections import OrderedDict
    from urllib import request
    from urllib.error import URLError, HTTPError
    import sys
    import re

    def get_text(page):
        """
        Retrieve text from a site

        Parses `page` with BeautifulSoup's "html.parser" backend and returns
        the document text. If parsing raises, a message is printed and an
        empty string is returned instead of propagating the error.
        """
        text = ""
        try:
            # Parse only inside the try block; parsing outside it as well
            # would let a parser failure escape before it can be reported.
            soup = BeautifulSoup(page, "html.parser")
            text = soup.get_text()
        except Exception:
            # `Exception` rather than a bare except so that Ctrl-C and
            # SystemExit still propagate.
            print("Unkown error occured while parsing html")
        return text

    def generate_dictionary(text):
    @@ -51,7 +56,17 @@ def main():
    exit(0)
    url = sys.argv[1]

    page = request.urlopen(url)
    try:
    page = request.urlopen(url)
    except HTTPError as e:
    print('Error code: ', e.code)
    except URLError as e:
    print('Url open error : ', e.reason)
    print('Try agin or check url')
    except:
    print("Unkown error while opening url")
    print('Try agin or check url')

    text = get_text(page)
    sorted_dictionary = sort_dictionary(generate_dictionary(text))

  2. JNjenga revised this gist Jan 8, 2022. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion diction.py
    Original file line number Diff line number Diff line change
    @@ -21,7 +21,6 @@ def generate_dictionary(text):
    char = text[i]
    if not char.isalpha():
    text.replace(char, " ")
    #text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    text = re.sub('[^0-9a-zA-Z\']+', ' ', text)
    words = text.split()
    dictionary = {}
  3. JNjenga created this gist Jan 8, 2022.
    63 changes: 63 additions & 0 deletions diction.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,63 @@
    import re
    import sys
    from collections import Counter, OrderedDict
    from urllib import request

    from bs4 import BeautifulSoup

    def get_text(page):
        """
        Return the text content of an HTML page.

        `page` is passed unchanged to BeautifulSoup, which parses it with
        the "html.parser" backend; the result of its get_text() is returned.
        """
        return BeautifulSoup(page, "html.parser").get_text()

    def generate_dictionary(text):
        """
        Generate a dictionary [{string:frequency}]

        The text is lower-cased, every run of characters other than ASCII
        letters, digits and apostrophes is treated as a separator, and the
        remaining tokens are counted.

        Returns a plain dict mapping each token to its number of
        occurrences, in order of first appearance. Empty or separator-only
        text yields an empty dict.
        """
        # A single regex pass does the normalisation. The previous
        # per-character loop called str.replace() and discarded the result
        # (strings are immutable), so it had no effect besides quadratic cost.
        normalized = re.sub(r"[^0-9a-zA-Z']+", " ", text.lower())
        # Counter keeps first-seen order; convert back so callers still get
        # an ordinary dict.
        return dict(Counter(normalized.split()))

    def sort_dictionary(dictionary):
        """
        Return the entries of `dictionary` as an OrderedDict with the keys
        in ascending order.
        """
        ordered = OrderedDict()
        for key in sorted(dictionary):
            ordered[key] = dictionary[key]
        return ordered

    def usage():
        """
        Print commandline usage message
        """
        print("usage: diction.py [-h] [url]")
        # The example must be a well-formed URL ("https://", single colon)
        # so it can be copied and run as shown.
        print("e.g diction.py https://www.pesapal.com")

    def main():
        """
        Command-line entry point: print the word frequencies of a web page.

        Takes the URL from the first command-line argument, downloads the
        page, and prints one "word<TAB>count" line per distinct word in
        ascending word order. With no argument, or with -h, the usage
        message is printed and the program exits with status 0.
        """
        # usage() advertises -h, so honour it rather than trying to open
        # the literal string "-h" as a URL.
        if len(sys.argv) == 1 or sys.argv[1] == "-h":
            usage()
            # sys.exit instead of the site-provided exit(), which is not
            # guaranteed to exist (e.g. under `python -S`).
            sys.exit(0)
        url = sys.argv[1]

        # The context manager closes the HTTP response as soon as the text
        # has been extracted.
        with request.urlopen(url) as page:
            text = get_text(page)
        sorted_dictionary = sort_dictionary(generate_dictionary(text))

        # The sorted mapping is iterated directly; copying it into a dict
        # first is unnecessary.
        for word, count in sorted_dictionary.items():
            print(word + "\t" + str(count))

    # Run the command-line tool only when executed as a script, not on import.
    if __name__ == "__main__":
        main()
    3 changes: 3 additions & 0 deletions requirements.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,3 @@
    beautifulsoup4==4.10.0
    bs4==0.0.1
    soupsieve==2.3.1