Last active
January 8, 2022 12:19
-
-
Save JNjenga/b13f2faabe9102fe64a7bbe0da922117 to your computer and use it in GitHub Desktop.
Revisions
-
JNjenga revised this gist
Jan 8, 2022. 1 changed file with 18 additions and 3 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,15 +1,20 @@ from bs4 import BeautifulSoup from collections import OrderedDict from urllib import request from urllib.error import URLError, HTTPError import sys import re def get_text(page): """ Retrieve text from a site """ text = "" try: soup = BeautifulSoup(page, "html.parser") text = soup.get_text() except: print("Unkown error occured while parsing html") return text def generate_dictionary(text): @@ -51,7 +56,17 @@ def main(): exit(0) url = sys.argv[1] try: page = request.urlopen(url) except HTTPError as e: print('Error code: ', e.code) except URLError as e: print('Url open error : ', e.reason) print('Try agin or check url') except: print("Unkown error while opening url") print('Try agin or check url') text = get_text(page) sorted_dictionary = sort_dictionary(generate_dictionary(text)) -
JNjenga revised this gist
Jan 8, 2022. 1 changed file with 0 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -21,7 +21,6 @@ def generate_dictionary(text): char = text[i] if not char.isalpha(): text.replace(char, " ") text = re.sub('[^0-9a-zA-Z\']+', ' ', text) words = text.split() dictionary = {} -
JNjenga created this gist
Jan 8, 2022. There are no files selected for viewing
# Gist page notice (preserved from the scraped page):
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters. Learn more
# about bidirectional Unicode characters
# Original file line number Diff line number Diff line change @@ -0,0 +1,63 @@
"""Print a word-frequency table for the visible text of a web page.

Usage: diction.py [-h] [url]
"""
from collections import Counter, OrderedDict
from operator import itemgetter
from urllib import request
from urllib.error import HTTPError, URLError
import re
import sys

# Any run of characters other than ASCII letters, digits or apostrophes is
# treated as a word separator.
_NON_WORD = re.compile(r"[^0-9a-zA-Z']+")


def get_text(page):
    """Return the text content of an HTML document.

    Args:
        page: HTML markup, as a string, bytes, or an open file-like object
            (anything ``BeautifulSoup`` accepts).

    Returns:
        The document text with all markup removed.
    """
    # Imported lazily so the text helpers below stay importable (and the
    # usage message stays printable) when bs4 is not installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(page, "html.parser")
    return soup.get_text()


def generate_dictionary(text):
    """Count how often each word occurs in ``text``.

    The text is lower-cased, then split on every run of characters that is
    not an ASCII letter, digit or apostrophe.

    Args:
        text: The text to analyse.

    Returns:
        A ``{word: frequency}`` dict, keyed in order of first occurrence.
    """
    words = _NON_WORD.sub(" ", text.lower()).split()
    return dict(Counter(words))


def sort_dictionary(dictionary):
    """Return an ``OrderedDict`` of ``dictionary`` sorted by key string."""
    return OrderedDict(sorted(dictionary.items(), key=itemgetter(0)))


def usage():
    """Print the command-line usage message."""
    print("usage: diction.py [-h] [url]")
    print("e.g diction.py https://www.pesapal.com")


def main():
    """Fetch the url given on the command line and print its word counts.

    Prints one ``word<TAB>count`` line per distinct word, sorted by word.
    Exits with status 0 after printing usage when no url (or ``-h``) is
    given, and with status 1 and a message on stderr when the url cannot be
    opened.
    """
    args = sys.argv[1:]
    if not args or args[0] in ("-h", "--help"):
        usage()
        sys.exit(0)

    url = args[0]
    try:
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as response:
            html = response.read()
    except HTTPError as err:
        # HTTPError subclasses URLError, so it has to be handled first.
        sys.exit(f"HTTP error {err.code} while opening {url}")
    except URLError as err:
        sys.exit(f"Could not open {url}: {err.reason}")
    except ValueError as err:
        # urlopen raises ValueError for urls without a usable scheme.
        sys.exit(f"Invalid url {url!r}: {err}")

    sorted_dictionary = sort_dictionary(generate_dictionary(get_text(html)))

    for word, count in sorted_dictionary.items():
        print(f"{word}\t{count}")


if __name__ == "__main__":
    main()

# Gist page notice (preserved from the scraped page):
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,3 @@ beautifulsoup4==4.10.0 bs4==0.0.1 soupsieve==2.3.1