Skip to content

Instantly share code, notes, and snippets.

@JNjenga
Last active January 8, 2022 12:19
Show Gist options
  • Select an option

  • Save JNjenga/b13f2faabe9102fe64a7bbe0da922117 to your computer and use it in GitHub Desktop.

Select an option

Save JNjenga/b13f2faabe9102fe64a7bbe0da922117 to your computer and use it in GitHub Desktop.
Converts website to text and prints a sorted list of unique words and their occurrences
from bs4 import BeautifulSoup
from collections import OrderedDict
from urllib import request
from urllib.error import URLError, HTTPError
import sys
import re
def get_text(page):
    """
    Extract the visible text from an HTML document.

    `page` is handed straight to BeautifulSoup with the "html.parser"
    backend, so it may be anything BeautifulSoup accepts as markup
    (main() passes the object returned by urlopen).

    Returns the text produced by `soup.get_text()`, or an empty string if
    parsing raised an exception (a message is printed in that case).
    """
    text = ""
    try:
        soup = BeautifulSoup(page, "html.parser")
        text = soup.get_text()
    except Exception:
        # Best-effort: report and fall back to "" rather than crash, but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would.
        print("Unkown error occured while parsing html")
    return text
def generate_dictionary(text):
    """
    Count how often each word occurs in `text`.

    The text is lower-cased, every run of characters other than ASCII
    letters, digits and apostrophes is turned into a single space, and the
    result is split on whitespace.

    Returns a plain dict mapping each word to its number of occurrences,
    with keys in order of first appearance.
    """
    # NOTE: the original also looped over every character calling
    # text.replace() without using the result; that was a quadratic no-op
    # and has been removed. The regex below does the actual cleaning.
    cleaned = re.sub(r"[^0-9a-zA-Z']+", " ", text.lower())
    dictionary = {}
    for word in cleaned.split():
        dictionary[word] = dictionary.get(word, 0) + 1
    return dictionary
def sort_dictionary(dictionary):
    """
    Return an OrderedDict holding the entries of `dictionary`, ordered by
    ascending key.
    """
    ordered = OrderedDict()
    # Keys of a dict are unique, so sorting the keys alone yields the same
    # order as sorting the (key, value) pairs by key.
    for key in sorted(dictionary):
        ordered[key] = dictionary[key]
    return ordered
def usage():
    """
    Print the command-line usage message to standard output.
    """
    print("usage: diction.py [-h] [url]")
    # The example previously read "https:://" (doubled colon), which is not
    # a valid URL and would fail if copied verbatim.
    print("e.g diction.py https://www.pesapal.com")
def main():
    """
    Command-line entry point.

    Reads a URL from sys.argv[1], downloads it, extracts the page text and
    prints one "word<TAB>count" line per unique word, sorted by word.

    Exits with status 0 after printing usage (no argument, or -h/--help),
    and with status 1 if the URL cannot be opened.
    """
    # usage() advertises -h, so honour it instead of treating it as a URL.
    if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
        usage()
        sys.exit(0)

    url = sys.argv[1]
    try:
        page = request.urlopen(url)
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print('Error code: ', e.code)
        sys.exit(1)
    except URLError as e:
        print('Url open error : ', e.reason)
        print('Try agin or check url')
        sys.exit(1)
    except Exception:
        # e.g. ValueError raised for a malformed / scheme-less URL.
        print("Unkown error while opening url")
        print('Try agin or check url')
        sys.exit(1)

    # Previously the handlers above fell through and `page` was used while
    # unbound; now we only get here with a valid response, which the
    # context manager closes once the text has been extracted.
    with page:
        text = get_text(page)

    sorted_dictionary = sort_dictionary(generate_dictionary(text))
    for word, count in sorted_dictionary.items():
        print(word + "\t" + str(count))
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
beautifulsoup4==4.10.0
bs4==0.0.1
soupsieve==2.3.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment