JNjenga · January 8, 2022 12:19 · Jan 8, 2022 · Jan 8, 2022 · Jan 8, 2022
diff --git a/diction.py b/diction.py
@@ -1,15 +1,20 @@
 from bs4 import BeautifulSoup
 from collections import OrderedDict
 from urllib import request
+from urllib.error import URLError, HTTPError
 import sys
 import re
 
 def get_text(page):
     """
     Retrieve text from a site
     """
-    soup = BeautifulSoup(page, "html.parser")
-    text = soup.get_text()
+    # Parsing is best-effort: when it fails a message is printed and the
+    # empty string is returned instead of propagating the error.
+    text = ""
+    try:
+        soup = BeautifulSoup(page, "html.parser")
+        text = soup.get_text()
+    except Exception:
+        # Exception rather than a bare except, so KeyboardInterrupt and
+        # SystemExit are not swallowed along with parser errors.
+        print("Unkown error occured while parsing html")
     return text
 
 def generate_dictionary(text):
@@ -51,7 +56,17 @@ def main():
         exit(0)
     url = sys.argv[1]
 
-    page = request.urlopen(url)
+    try:
+        page = request.urlopen(url)
+    except HTTPError as e:
+        print('Error code: ', e.code)
+    except URLError as e:
+        print('Url open error : ', e.reason)
+        print('Try agin or check url')
+    except:
+        print("Unkown error while opening url")
+        print('Try agin or check url')
+
     text = get_text(page)
     sorted_dictionary = sort_dictionary(generate_dictionary(text))
 

diff --git a/diction.py b/diction.py
@@ -21,7 +21,6 @@ def generate_dictionary(text):
         char = text[i]
         if not char.isalpha():
             text.replace(char, " ")
-    #text = re.sub('[^0-9a-zA-Z]+', ' ', text)
     text = re.sub('[^0-9a-zA-Z\']+', ' ', text)
     words = text.split()
     dictionary = {}

diff --git a/diction.py b/diction.py
@@ -0,0 +1,63 @@
+from bs4 import BeautifulSoup
+from collections import OrderedDict
+from urllib import request
+import sys
+import re
+
+def get_text(page):
+    """
+    Return the text content of an HTML page.
+
+    `page` is passed unchanged to BeautifulSoup, which parses it with the
+    built-in "html.parser" backend; the markup-free text is returned.
+    """
+    return BeautifulSoup(page, "html.parser").get_text()
+
+def generate_dictionary(text):
+    """
+    Generate a dictionary [{string:frequency}]
+
+    The text is lower-cased, every run of characters other than ASCII
+    letters, digits and apostrophes is collapsed to a single space, and
+    the remaining whitespace-separated words are counted.
+
+    Returns a dict mapping each word to the number of times it occurs.
+    An empty or punctuation-only text yields an empty dict.
+    """
+    # The regex does all of the normalisation in one pass. A per-character
+    # str.replace() pre-pass is not needed: str.replace returns a new
+    # string, so calling it without reassigning has no effect and only
+    # adds quadratic work on long pages.
+    text = re.sub(r"[^0-9a-zA-Z']+", " ", text.lower())
+    dictionary = {}
+    for word in text.split():
+        dictionary[word] = dictionary.get(word, 0) + 1
+    return dictionary
+
+def sort_dictionary(dictionary):
+    """
+    Return an OrderedDict holding the same items, ordered by key.
+    """
+    ordered = OrderedDict()
+    # Keys are unique, so sorting the keys gives the same order as
+    # sorting the (key, value) pairs by their first element.
+    for key in sorted(dictionary):
+        ordered[key] = dictionary[key]
+    return ordered
+
+def usage():
+    """
+    Print commandline usage message
+
+    Writes two lines to standard output: the invocation synopsis and an
+    example call. Returns None.
+    """
+    print("usage: diction.py [-h] [url]")
+    # The example must be a well-formed URL ("https://", single colon) so
+    # that it can be copied and run as-is.
+    print("e.g diction.py https://www.pesapal.com")
+
+def main():
+    """
+    Command-line entry point.
+
+    Takes the target URL from sys.argv[1], downloads the page, and prints
+    one "word<TAB>count" line per distinct word in the order produced by
+    sort_dictionary(). When no argument is given, the usage message is
+    printed and the process exits with status 0.
+    """
+    if len(sys.argv) == 1:
+        usage()
+        # sys.exit rather than the bare exit(): the latter is injected by
+        # the site module and is not guaranteed to exist (e.g. python -S).
+        sys.exit(0)
+    url = sys.argv[1]
+
+    # The response object owns an open connection; the with-block closes
+    # it as soon as the body has been read and parsed.
+    with request.urlopen(url) as page:
+        text = get_text(page)
+    sorted_dictionary = sort_dictionary(generate_dictionary(text))
+
+    # Iterate the ordered mapping directly; copying it into a plain dict
+    # first adds nothing.
+    for word, count in sorted_dictionary.items():
+        print(word + "\t" + str(count))
+
+# Run the command-line interface only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.10.0
+bs4==0.0.1
+soupsieve==2.3.1
No results found