Last active: December 30, 2017 21:40
Revisions
yarko revised this gist
Dec 30, 2017. 1 changed file with 3 additions and 1 deletion.
@@ -87,7 +87,9 @@ def pattern_adjust(link_address, rbase=None):
    global website
    global INTERNAL
    # if we're checking local, might as well
    # check on-page, too - for typos
    if link_address[0] == '#' and not INTERNAL:  # local
        return (None, None)
    # create a local static var:
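For context on this change: a '#'-only href is a same-page anchor, which urlsplit reports as a bare fragment. A minimal standalone sketch of that test; the function name and the check_internal flag are illustrative, not part of the gist:

from urllib.parse import urlsplit

def is_fragment_only(link, check_internal=False):
    # '#section' style links point at the current page; skip them
    # unless we are also validating on-page anchors (check_internal).
    parts = urlsplit(link)
    return (not parts.scheme and not parts.netloc and not parts.path
            and bool(parts.fragment) and not check_internal)

print(is_fragment_only('#books'))          # True  - same-page anchor, skipped
print(is_fragment_only('/books#reviews'))  # False - has a path, still checked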
yarko revised this gist
Dec 28, 2017. 1 changed file with 1 addition and 4 deletions.
@@ -92,8 +92,6 @@ def pattern_adjust(link_address, rbase=None):
    # create a local static var:
    # - depends on global "website"
    # if 'rbase' not in pattern_adjust.__dict__ \
    #    pattern_adjust.rbase = urlsplit(website)
@@ -102,7 +100,6 @@ def pattern_adjust(link_address, rbase=None):
    if not INTERNAL and \
       r.netloc == rbase.netloc:
        return (None, None)
    # NOTE: I don't really understand
    #       what this is doing, so annotating:
    # if relative URL (local)
@@ -195,10 +192,10 @@ def extract_link(address):
if __name__ == "__main__":
    arguments = docopt(__doc__)
    BASEURL = arguments['--base']
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once
yarko revised this gist
Dec 28, 2017. 1 changed file with 4 additions and 4 deletions.
@@ -3,7 +3,7 @@
check_link.py

Parses a url, and checks links in it for validity.
Normally, does not check links to the source URL's pages.
Normally, only reports problem links.

Usage:
@@ -13,13 +13,13 @@
  -b BASEURL --base= BASEURL  checking multiple pages on a site?
                              set the base, and supages to check
                              ('/' will check BASEURL)
  -i --internal  also check links internal to the site;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;

Examples:
  check_link.py -b https://mysite.io books blog  # check only 2 subpages
  check_link.py -b https://mysite.io / /blog     # check home page too
'''
import sys
import requests
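The Usage/Options text above is exactly what docopt parses into the arguments dict read in __main__. A minimal sketch of that round trip, assuming a hypothetical mini_check.py module name:

'''Usage: mini_check.py [-i] [-v] [-b BASEURL] <url>...

Options:
  -b BASEURL --base=BASEURL  base site to resolve subpages against
  -i --internal              also check links internal to the site
  -v --verbose               report all link outcomes, good and bad
'''
from docopt import docopt

if __name__ == '__main__':
    args = docopt(__doc__)
    # e.g. "mini_check.py -b https://mysite.io books blog" yields:
    # {'--base': 'https://mysite.io', '--internal': False,
    #  '--verbose': False, '<url>': ['books', 'blog']}
    print(args)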
yarko revised this gist
Dec 28, 2017. 1 changed file with 14 additions and 4 deletions.
@@ -7,12 +7,19 @@
Normally, only reports problem links.

Usage:
  check_link.py [-i] [-v] [-b BASEURL] <url>...

Options:
  -b BASEURL --base= BASEURL  checking multiple pages on a site?
                              set the base, and supages to check
                              ('/' will check BASEURL)
  -i --internal  check links internal to the site, also;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;

Examples:
  check_link.py -b https://mysite.io /books /blog  # check only 2 subpages
  check_link.py -b https://mysite.io / /blog       # check home page too
'''
import sys
import requests
@@ -191,6 +198,7 @@ def extract_link(address):
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # "https://davericho.com/books/"
    BASEURL = arguments['--base']
    colorama_init()
    # to facilitate checking each link only once
@@ -203,5 +211,7 @@ def extract_link(address):
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        if BASEURL:
            website = urljoin(BASEURL, website)
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        result = extract_link(website)
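The new BASEURL handling leans on urljoin semantics, where '/' resolves to the site root and other paths resolve against the base. A quick sketch with an illustrative base URL:

from urllib.parse import urljoin

base = 'https://mysite.io'
for page in ('/', '/books', 'blog'):
    print(urljoin(base, page))
# https://mysite.io/
# https://mysite.io/books
# https://mysite.io/blog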
yarko revised this gist
Dec 28, 2017. 1 changed file with 1 addition and 1 deletion.
@@ -63,7 +63,7 @@ def check(address, netloc):
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n{" "*19}NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'
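The 301 branch resolves a possibly relative Location header against the original address. A small sketch of that resolution; the URLs and the header value are made up, and the split(';') mirrors the gist's defensive handling of parameterized header values:

from urllib.parse import urljoin

address = 'http://example.com/old-page'
location = '/new-page; rel=canonical'   # hypothetical raw header value
newaddress = urljoin(address, location.split(';')[0])
print(newaddress)                       # http://example.com/new-page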
yarko revised this gist
Dec 28, 2017. 1 changed file with 47 additions and 3 deletions.
@@ -14,6 +14,7 @@
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;
'''
import sys
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
@@ -51,7 +52,7 @@ def check(address, netloc):
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}{Fore.RESET}'
    if resp.status_code in \
            [301, 308,
@@ -64,7 +65,8 @@ def check(address, netloc):
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'
    return msg
@@ -122,6 +124,44 @@ def pattern_adjust(link_address, rbase=None):
        return (link_address, r.netloc)


def string_trunc(s, field_width=73, fill='.'):
    '''
    usage:
        s, f, w = string_trunc(longurl)
        print(f']{s:{f}<{w}}[')
    returns:
        a truncated (if needed) string, a fill char,
        and a matching field_width
    '''
    str_width = len(s)
    if str_width > field_width:
        # room for 3 fill chars
        return s[:field_width-3], fill, field_width
    else:
        # this doesn't work: need real values
        # return s, None, None
        # width of zero seems to not do width,
        # but I need returned str_width to clear progress line;
        # fill could be anything (another "0", but...)
        return s, " ", str_width


def progress(msg):
    '''
    hack to print progress
    '''
    if 'w' not in progress.__dict__:
        progress.w = 0
    # clear previous progress line
    print(f'\r{" "*progress.w}', end='', file=sys.stderr)
    s, f, w = string_trunc(msg)
    print(f'\r{s:{f}<{w}}', end='', file=sys.stderr)
    progress.w = w


def extract_link(address):
    global link_status
    global session
@@ -135,10 +175,14 @@ def extract_link(address):
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                # I'm jonesin' for some progress indicators
                progress(link[value])
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        # the '\r' is a hack to show stdout ok w/ progress msgs
                        print('\r', end='', file=sys.stderr)
                        print(link_status[p])
@@ -159,5 +203,5 @@ def extract_link(address):
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        result = extract_link(website)
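The progress hack added here relies on f-string format specs whose fill character and field width are themselves variables, which is what string_trunc's return values feed. A tiny standalone sketch of just that padding behaviour:

def show(s, fill, width):
    # pad (or leave) s to 'width' columns using 'fill', bracketed so
    # the effective field is visible
    print(f']{s:{fill}<{width}}[')

show('short-url', '.', 20)   # ]short-url...........[
show('short-url', ' ', 9)    # ]short-url[  width == len(s), nothing added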
yarko revised this gist
Dec 27, 2017. 1 changed file with 5 additions and 4 deletions.
@@ -21,9 +21,9 @@
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# this is a modification of a gist;
# it got me quickly started for now.
# Original:
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git
@@ -116,7 +116,8 @@ def pattern_adjust(link_address, rbase=None):
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) \
            if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)
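The question in the comment (urljoin against the global website string versus against the reassembled rbase) comes down to urlsplit/urlunsplit round-tripping. A quick sketch with an illustrative page URL and relative link:

from urllib.parse import urljoin, urlsplit, urlunsplit

website = 'https://mysite.io/books/'       # illustrative page being scanned
rbase = urlsplit(website)

link = './covers/front.png'                # a relative on-page link
print(urljoin(website, link))              # https://mysite.io/books/covers/front.png
print(urljoin(urlunsplit(rbase), link))    # same result: urlunsplit(urlsplit(x)) == x here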
yarko revised this gist
Dec 27, 2017. 1 changed file with 2 additions and 1 deletion.
@@ -148,13 +148,14 @@ def extract_link(address):
    websites = arguments['<url>']  # "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once
    link_status = {}
    # sites which don't accept 'head' requests (dynamic)
    # populate with results of urlsplit().netloc
    FULL_GET = ('www.amazon.com',)
    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---')
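The FULL_GET collection and the shared session with a rewritten User-Agent are the gist's workaround for hosts that reject HEAD requests or default Python agents. A sketch of that HEAD-then-GET fallback in isolation; the URL and agent string are placeholders, and a set is used here so the membership can grow:

import requests
from urllib.parse import urlsplit

session = requests.Session()
session.headers.update({'User-Agent': 'test'})   # anything but the default python agent

FULL_GET = {'www.amazon.com'}                    # hosts known to require a full GET

def fetch_status(address):
    netloc = urlsplit(address).netloc
    retrieve = session.get if netloc in FULL_GET else session.head
    resp = retrieve(address)
    if resp.status_code == 405:                  # host refuses HEAD: retry with GET, remember it
        resp = session.get(address)
        FULL_GET.add(netloc)
    return resp.status_code

# print(fetch_status('https://example.com/'))    # e.g. 200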
yarko revised this gist
Dec 27, 2017. 1 changed file with 112 additions and 53 deletions.
@@ -7,96 +7,155 @@
Normally, only reports problem links.

Usage:
  check_link.py [-i] [-v] <url>...

Options:
  -i --internal  check links internal to the site, also;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;
'''
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# this is a modification of a pretty terrible gist,
# but it got me quickly started for now.
# Original (I may delete this link eventually):
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git


def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session

    msg = None  # the normal "ok" is no message
    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
        # NOTE: amazon denies requests from python scripts, so we use
        #  a session with an updated 'User-Agent' throughout ('session')
        #  amazon.com remembers if the session.get() was from a python agent,
        #  and then denies the session.get(), even if it updated
        #  its 'user-agent' with a 503 - Service Unavailable
        # OPTIMIZATION:
        #  we try a light-weight session.head() call first;
        #  if it fails for a domain w/ 405 (Method Not Allowed), then
        #  we retry with a full session.get(), and log the location so
        #  we always try the long way;
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}'
    if resp.status_code in \
            [301, 308,
             400, 401, 402, 403, 404, 405, 408, 409, 410,
             501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - {resp.reason} => {address}'
    return msg


def pattern_adjust(link_address, rbase=None):
    '''
    returns "adjusted" address and netloc;
    don't follow local addresses, unless option set to follow internal addresses
    '''
    global website
    global INTERNAL
    if link_address[0] == '#':  # local
        return (None, None)
    # create a local static var:
    # - depends on global "website"
    # TODO: this has potential to break when
    #       we loop over multiple URLs
    # if 'rbase' not in pattern_adjust.__dict__ \
    #    pattern_adjust.rbase = urlsplit(website)
    r = urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
       r.netloc == rbase.netloc:
        return (None, None)  # TODO: this could possibly follow local links
    # NOTE: I don't really understand
    #       what this is doing, so annotating:
    # if relative URL (local)
    # TODO: I am getting convinced what this wants to do
    #       should be done w/ a simple urljoin()
    #       I'm also thinking this code branch isn't traversed;
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # reconstitute - it won't be a full path
        d = urlunsplit(r)
        # This if seems exceedingly wonky
        if d.startswith('//'):
            # if it starts with '//', throw that away...
            # m = re.search('(?<=//)\S+', d)
            # d = m.group(0)
            # TODO: if r.netloc is empty, then this
            #       could result in an incorrect URL:
            #       => if address = foo.com/something - then ok
            #       => if address relaive: ./static/something - then trouble
            return ("https://" + d[2:], r.netloc)
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)


def extract_link(address):
    global link_status
    global session

    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    # the partitioned pieces of URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        print(link_status[p])


if __name__ == "__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # "https://davericho.com/books/"
    colorama_init()
    link_status = {}
    # sites which don't accept 'head' requests (dynamic)
    # populate with results of urlsplit().netloc
    FULL_GET = ('www.amazon.com',)
    session = requests.Session()
    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---')
        result = extract_link(website)
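For reference, the tags dict plus SoupStrainer restricts each BeautifulSoup pass to one element type, which is how extract_link walks a page. A self-contained sketch over static, made-up HTML:

from bs4 import BeautifulSoup, SoupStrainer

html = '''<html><body>
  <a href="https://example.com/">example</a>
  <img src="/static/logo.png">
  <script src="/static/app.js"></script>
</body></html>'''

tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
for key, value in tags.items():
    # parse_only keeps only the elements we care about for this pass
    for link in BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(key)):
        if link.has_attr(value):
            print(key, '->', link[value])
# a -> https://example.com/
# img -> /static/logo.png
# script -> /static/app.js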
yarko renamed this gist
Dec 27, 2017. 1 changed file with 0 additions and 0 deletions.
File renamed without changes.
yarko revised this gist
Dec 27, 2017. 1 changed file with 87 additions and 58 deletions.
@@ -1,73 +1,102 @@
#!/usr/bin/env python
'''
check_link.py

Parses a url, and checks links in it for validity.
Normally, does not check links to the target URL.
Normally, only reports problem links.

Usage:
  check_link.py [-i] [-v] <url>

Options:
  -i --internal  check links internal to the site, also;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;
'''
import requests
from urllib.parse import *
from docopt import docopt
import colorama,re,queue,threading
from colorama import Fore
from bs4 import BeautifulSoup,SoupStrainer

# this is a modification of a pretty terrible gist,
# but it got me quickly started for now.
# Original (I may delete this link eventually):
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git


def check(address):
    global VERBOSE
    try:
        resp=requests.head(address)
        if resp.status_code in [400,404,403,408,409,501,502,503]:
            return f'{Fore.RED}{resp.status_code} - {resp.reason} --> {address}'
        elif VERBOSE:
            return f'{Fore.GREEN}no problem in --> {address}'
        else:
            return None
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}'
        pass


def pattern_adjust(link_address):
    '''
    don't follow local addresses, unless
    specifically set option to follow internal addresses
    '''
    if re.match('^#' ,link_address):  # local
        return 0
    # create a local static var:
    # - depends on global "website"
    if 'rbase' not in pattern_adjust.__dict__:
        global website
        global INTERNAL
        pattern_adjust.rbase = urlsplit(website)
    r=urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
       r.netloc == pattern_adjust.rbase.netloc:
        return 0  # TODO: this could possibly follow local links
    if r.scheme=='' and (r.netloc!='' or r.path!=''):
        d=urlunsplit(r)
        if re.match('^//' ,d):
            m= re.search('(?<=//)\S+', d)
            d=m.group(0)
            return "https://"+d
    elif r.scheme=='' and r.netloc=='':
        return websit+link_address if INTERNAL else 0
    else:
        return link_address


def extract_link(address):
    global link_status
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    response=requests.get(address)
    # for key,value in iter(list(tags.items())):
    for key, value in tags.items():
        try:
            for link in BeautifulSoup(response.content,"html.parser",parse_only=SoupStrainer(key)):
                if link.has_attr(value):
                    p=pattern_adjust(link[value])
                    if p and not p in link_status:
                        link_status[p] = check(p)
                        if link_status[p]:
                            print(link_status[p])
        except Exception as e:
            print((e,address))


if __name__=="__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    colorama.init()
    link_status = {}
    website = arguments['<url>']  # "https://davericho.com/books/"
    result = extract_link(website)
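The 'local static var' trick in pattern_adjust stores state on the function object's own __dict__ so the base URL is split only once. A tiny sketch of the pattern in isolation, with an illustrative site URL:

from urllib.parse import urlsplit

def base_netloc(link, site='https://mysite.io/books/'):
    # cache the split of the site on the function object the first time through
    if 'rbase' not in base_netloc.__dict__:
        base_netloc.rbase = urlsplit(site)
    return urlsplit(link).netloc == base_netloc.rbase.netloc

print(base_netloc('https://mysite.io/blog'))   # True  - same host as the cached base
print(base_netloc('https://example.com/'))     # False - external host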
hackerdem created this gist
May 11, 2016.
@@ -0,0 +1,73 @@
from bs4 import BeautifulSoup,SoupStrainer
import urllib.request
import colorama,re,queue,threading
from colorama import Fore
from urllib.parse import *

class check_link():
    def __init__(self,address):
        self.address=address

    def check(self,address):
        try:
            req=urllib.request.Request(url=address)
            resp=urllib.request.urlopen(req)
            if resp.status in [400,404,403,408,409,501,502,503]:print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
            else:
                print (Fore.GREEN+"no problem in-->"+address)
        except Exception as e:
            print (Fore.YELLOW+"{}-{}".format(e,address))
            pass

def pattern_adjust(a):
    try:
        if re.match('^#' ,a):return 0
        r=urlsplit(a)
        if r.scheme=='' and (r.netloc!='' or r.path!=''):
            d=urlunsplit(r)
            if re.match('^//' ,d):
                m= re.search('(?<=//)\S+', d)
                d=m.group(0)
                m="https://"+d
                return m
        elif r.scheme=='' and r.netloc=='':
            return address+a
        else:return a
    except Exception as e:
        pass

def extract_link(address):
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    for key,value in iter(tags.items()):
        try:
            res=urllib.request.urlopen(address)
            response=res.read().decode('utf-8')  #needs improvement
            for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)):
                if link.has_attr(value):
                    p=pattern_adjust(link[value])
                    if p!=0 and str(p)!='None':
                        newcheck=check_link(p)
                        newcheck.check(p)
                        if p not in hyperlinks:
                            hyperlinks.add(p)
                            if website.split('.')[1] in p:  #needs improvement
                                if not website.endswith(('.png','.jpeg','.js','jpg')):
                                    q.put(p)
        except Exception as e:
            print (e,address)

def threader():
    while True:
        value=q.get()
        result=extract_link(value)
        q.task_done()

if __name__=="__main__":
    colorama.init()
    q=queue.Queue()
    global hyperlinks,website
    hyperlinks=set()
    website=input("Please enter the website address: ")
    for x in range(30):
        t=threading.Thread(target=threader)
        t.deamon=True
        t.start()
    q.put(website.strip())
    q.join()
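The original's concurrency model is the standard queue.Queue plus worker threads with q.task_done() and q.join(). A self-contained sketch of that pattern; the work function is a stand-in rather than the link checker, and note that the gist's 't.deamon' spelling only sets an unused attribute, so its threads are not actually daemonic:

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()          # blocks until a task is available
        print('processing', item)
        q.task_done()           # lets q.join() know this task finished

for _ in range(4):
    t = threading.Thread(target=worker)
    t.daemon = True             # correct spelling: worker threads die with the main thread
    t.start()

for item in ('a', 'b', 'c'):
    q.put(item)
q.join()                        # wait until every queued task is marked done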