Last active
January 8, 2022 12:19
-
-
Save JNjenga/b13f2faabe9102fe64a7bbe0da922117 to your computer and use it in GitHub Desktop.
Converts website to text and prints a sorted list of unique words and their occurrences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| from collections import OrderedDict | |
| from urllib import request | |
| from urllib.error import URLError, HTTPError | |
| import sys | |
| import re | |
def get_text(page):
    """
    Retrieve the text content of an HTML page.

    page: the HTML to parse; it is handed unchanged to BeautifulSoup
    together with the "html.parser" backend.

    Returns the string produced by ``soup.get_text()``. If parsing raises,
    a message is printed and an empty string is returned instead.
    """
    try:
        return BeautifulSoup(page, "html.parser").get_text()
    except Exception:
        # Deliberately best-effort, but catch Exception rather than using a
        # bare except so KeyboardInterrupt/SystemExit still propagate.
        print("Unkown error occured while parsing html")
        return ""
def generate_dictionary(text):
    """
    Generate a dictionary {word: frequency} from a piece of text.

    The text is lower-cased, every run of characters other than ASCII
    letters, digits and apostrophes is collapsed into a single space, and
    the result is split on whitespace to obtain the words.

    text: the string to analyse.

    Returns a plain dict mapping each word to the number of times it
    occurs, with keys in order of first appearance.
    """
    # A per-character str.replace() pass is unnecessary (and would have no
    # effect unless its result were assigned, since strings are immutable);
    # this single substitution performs the whole normalisation.
    normalised = re.sub(r"[^0-9a-zA-Z']+", " ", text.lower())

    dictionary = {}
    for word in normalised.split():
        dictionary[word] = dictionary.get(word, 0) + 1
    return dictionary
def sort_dictionary(dictionary):
    """
    Return an OrderedDict holding the entries of ``dictionary`` arranged
    in ascending order of their keys.
    """
    ordered = OrderedDict()
    for key in sorted(dictionary):
        ordered[key] = dictionary[key]
    return ordered
def usage():
    """
    Print the command-line usage message to standard output.
    """
    print("usage: diction.py [-h] [url]")
    # The example must be a well-formed URL ("https://", single colon) so
    # that it works when copied verbatim.
    print("e.g diction.py https://www.pesapal.com")
def main():
    """
    Command-line entry point.

    Reads the URL from the first command-line argument, downloads the
    page, extracts its text and prints every unique word with its number
    of occurrences, one "word<TAB>count" pair per line, sorted by word.

    With no argument, or with "-h", the usage message is printed and the
    process exits with status 0. If the URL cannot be opened, an error
    message is printed and the process exits with status 1.
    """
    if len(sys.argv) == 1 or sys.argv[1] == "-h":
        usage()
        # sys.exit rather than the interactive exit() helper, which is
        # installed by the site module and is not guaranteed to exist.
        sys.exit(0)

    url = sys.argv[1]
    try:
        page = request.urlopen(url)
    except HTTPError as e:
        print('Error code: ', e.code)
        sys.exit(1)
    except URLError as e:
        print('Url open error : ', e.reason)
        print('Try agin or check url')
        sys.exit(1)
    except Exception:
        # e.g. ValueError for a malformed URL. Exception (not a bare
        # except) keeps Ctrl-C working.
        print("Unkown error while opening url")
        print('Try agin or check url')
        sys.exit(1)

    # Every failure path above exits, so `page` is always bound here.
    # The response is closed as soon as its content has been parsed.
    with page:
        text = get_text(page)

    sorted_dictionary = sort_dictionary(generate_dictionary(text))
    for word, count in sorted_dictionary.items():
        print(word + "\t" + str(count))
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| beautifulsoup4==4.10.0 | |
| bs4==0.0.1 | |
| soupsieve==2.3.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment