
@yarko
Last active December 30, 2017 21:40

Revisions

  1. yarko revised this gist Dec 30, 2017. 1 changed file with 3 additions and 1 deletion.
    4 changes: 3 additions & 1 deletion check_links.py
    @@ -87,7 +87,9 @@ def pattern_adjust(link_address, rbase=None):
    global website
    global INTERNAL

    if link_address[0] == '#': # local
    # if we're checking local, might as well
    # check on-page, too - for typos
    if link_address[0] == '#' and not INTERNAL: # local
    return (None, None)

    # create a local static var:
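    With this change, fragment-only links like '#contact' are skipped only when --internal is off; under -i they fall through and are resolved against the page being scanned (the comment's rationale: catch on-page typos too). A minimal sketch of that branch - the helper name and sample values are illustrative, not code from the gist:

      from urllib.parse import urljoin

      INTERNAL = True                      # as set by the -i / --internal flag
      website = 'https://mysite.io/books'  # page currently being scanned

      def handle_fragment(link_address):
          # revised behaviour: skip '#...' links only when not checking internal links
          if link_address[0] == '#' and not INTERNAL:
              return (None, None)
          # under -i, resolve the fragment against the current page so it gets checked
          return (urljoin(website, link_address), '')

      print(handle_fragment('#contact'))
      # INTERNAL=True  -> ('https://mysite.io/books#contact', '')
      # INTERNAL=False -> (None, None)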
  2. yarko revised this gist Dec 28, 2017. 1 changed file with 1 addition and 4 deletions.
    5 changes: 1 addition & 4 deletions check_links.py
    @@ -92,8 +92,6 @@ def pattern_adjust(link_address, rbase=None):

    # create a local static var:
    # - depends on global "website"
    # TODO: this has potential to break when
    # we loop over multiple URLs
    # if 'rbase' not in pattern_adjust.__dict__ \
    # pattern_adjust.rbase = urlsplit(website)

    @@ -102,7 +100,6 @@ def pattern_adjust(link_address, rbase=None):
    if not INTERNAL and \
    r.netloc == rbase.netloc:
    return (None, None)
    # TODO: this could possibly follow local links
    # NOTE: I don't really understand
    # what this is doing, so annotating:
    # if relative URL (local)
    @@ -195,10 +192,10 @@ def extract_link(address):

    if __name__ == "__main__":
    arguments = docopt(__doc__)
    BASEURL = arguments['--base']
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>'] # "https://davericho.com/books/"
    BASEURL = arguments['--base']

    colorama_init()
    # to facilitate checking each link only once
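    The lines removed here had cached the split base URL as a "static" attribute on the function object; since that attribute is set only on the first call, it would go stale once the script loops over several <url> arguments, which is why rbase is now passed in per call instead. A small illustration of the hazard (names and URLs are illustrative):

      from urllib.parse import urlsplit

      def cached_base(website):
          # the pattern being removed: a "static" var on the function object
          if 'rbase' not in cached_base.__dict__:
              cached_base.rbase = urlsplit(website)   # set once, never refreshed
          return cached_base.rbase

      print(cached_base('https://site-one.io/').netloc)   # site-one.io
      print(cached_base('https://site-two.io/').netloc)   # still site-one.io - stale

      # the revision sidesteps this by computing rbase in extract_link()
      # and passing it to pattern_adjust(link_address, rbase) on every call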
  3. yarko revised this gist Dec 28, 2017. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions check_links.py
    @@ -3,7 +3,7 @@
    check_link.py
    Parses a url, and checks links in it for validity.
    Normally, does not check links to the target URL.
    Normally, does not check links to the source URL's pages.
    Normally, only reports problem links.
    Usage:
    @@ -13,13 +13,13 @@
    -b BASEURL --base= BASEURL checking multiple pages on a site?
    set the base, and subpages to check
    ('/' will check BASEURL)
    -i --internal check links internal to the site, also;
    -i --internal also check links internal to the site;
    -v --verbose report all link outcomes, good and bad;
    -h --help Show this message and exit;
    Examples:
    check_link.py -b https://mysite.io /books /blog # check only 2 subpages
    check_link.py -b https://mysite.io / /blog # check home page too
    check_link.py -b https://mysite.io books blog # check only 2 subpages
    check_link.py -b https://mysite.io / /blog # check home page too
    '''
    import sys
    import requests
  4. yarko revised this gist Dec 28, 2017. 1 changed file with 14 additions and 4 deletions.
    18 changes: 14 additions & 4 deletions check_links.py
    @@ -7,12 +7,19 @@
    Normally, only reports problem links.
    Usage:
    check_link.py [-i] [-v] <url>...
    check_link.py [-i] [-v] [-b BASEURL] <url>...
    Options:
    -i --internal check links internal to the site, also;
    -v --verbose report all link outcomes, good and bad;
    -h --help Show this message and exit;
    -b BASEURL --base= BASEURL checking multiple pages on a site?
    set the base, and subpages to check
    ('/' will check BASEURL)
    -i --internal check links internal to the site, also;
    -v --verbose report all link outcomes, good and bad;
    -h --help Show this message and exit;
    Examples:
    check_link.py -b https://mysite.io /books /blog # check only 2 subpages
    check_link.py -b https://mysite.io / /blog # check home page too
    '''
    import sys
    import requests
    @@ -191,6 +198,7 @@ def extract_link(address):
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>'] # "https://davericho.com/books/"
    BASEURL = arguments['--base']

    colorama_init()
    # to facilitate checking each link only once
    @@ -203,5 +211,7 @@ def extract_link(address):
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
    if BASEURL:
    website = urljoin(BASEURL, website)
    print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
    result = extract_link(website)
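    With -b set, each <url> argument is treated as a page on the site and joined onto BASEURL before scanning, which is why '/' checks the base page itself. How urljoin resolves the paths from the help text's examples (values illustrative):

      from urllib.parse import urljoin

      BASEURL = 'https://mysite.io'

      for page in ('/books', '/blog', '/'):
          print(urljoin(BASEURL, page))
      # https://mysite.io/books
      # https://mysite.io/blog
      # https://mysite.io/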
  5. yarko revised this gist Dec 28, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion check_links.py
    @@ -63,7 +63,7 @@ def check(address, netloc):
    if resp.status_code == 301:
    newaddress = urljoin(address,
    resp.headers["Location"].split(";")[0])
    msg += f'\n NEW: => {newaddress}'
    msg += f'\n{" "*19}NEW: => {newaddress}'
    elif VERBOSE:
    msg = f'{Fore.GREEN}{resp.status_code} - ' \
    f'{resp.reason} => {address}{Fore.RESET}'
  6. yarko revised this gist Dec 28, 2017. 1 changed file with 47 additions and 3 deletions.
    50 changes: 47 additions & 3 deletions check_links.py
    @@ -14,6 +14,7 @@
    -v --verbose report all link outcomes, good and bad;
    -h --help Show this message and exit;
    '''
    import sys
    import requests
    from urllib.parse import urljoin, urlsplit, urlunsplit
    from docopt import docopt
    @@ -51,7 +52,7 @@ def check(address, netloc):
    resp = session.get(address)
    FULL_GET.add(netloc)
    except Exception as e:
    return f'{Fore.YELLOW}{e} - {address}'
    return f'{Fore.YELLOW}{e} - {address}{Fore.RESET}'

    if resp.status_code in \
    [301, 308,
    @@ -64,7 +65,8 @@ def check(address, netloc):
    resp.headers["Location"].split(";")[0])
    msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
    msg = f'{Fore.GREEN}{resp.status_code} - {resp.reason} => {address}'
    msg = f'{Fore.GREEN}{resp.status_code} - ' \
    f'{resp.reason} => {address}{Fore.RESET}'
    return msg


    @@ -122,6 +124,44 @@ def pattern_adjust(link_address, rbase=None):
    return (link_address, r.netloc)


    def string_trunc(s, field_width=73, fill='.'):
    '''
    usage:
    s, f, w = string_trunc(longurl)
    print(f']{s:{f}<{w}}[')
    returns:
    a truncated (if needed) string,
    a fill char, and
    a matching field_width
    '''
    str_width = len(s)

    if str_width > field_width:
    # room for 3 fill chars
    return s[:field_width-3], fill, field_width
    else:
    # this doesn't work: need real values
    # return s, None, None
    # width of zero seems to not do width,
    # but I need returned str_width to clear progress line;
    # fill could be anything (another "0", but...)
    return s, " ", str_width


    def progress(msg):
    '''
    hack to print progress
    '''
    if 'w' not in progress.__dict__:
    progress.w = 0

    # clear previous progress line
    print(f'\r{" "*progress.w}', end='', file=sys.stderr)
    s, f, w = string_trunc(msg)
    print(f'\r{s:{f}<{w}}', end='', file=sys.stderr)
    progress.w = w


    def extract_link(address):
    global link_status
    global session
    @@ -135,10 +175,14 @@ def extract_link(address):
    for link in BeautifulSoup(response.content, "html.parser",
    parse_only=SoupStrainer(key)):
    if link.has_attr(value):
    # I'm jonesin' for some progress indicators
    progress(link[value])
    p, netloc = pattern_adjust(link[value], rbase)
    if p and p not in link_status:
    link_status[p] = check(p, netloc)
    if link_status[p]:
    # the '\r' is a hack to show stdout ok w/ progress msgs
    print('\r', end='', file=sys.stderr)
    print(link_status[p])


    @@ -159,5 +203,5 @@ def extract_link(address):
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
    print(f'{Fore.CYAN}--- checking links on {website} ---')
    print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
    result = extract_link(website)
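    The new progress() helper keeps a single status line alive on stderr: blank out the previous message, return to column 0 with '\r', and print the new (possibly truncated) text without a newline. A stripped-down sketch of that technique, separate from the gist's own code:

      import sys
      import time

      def show_progress(msg, _state={'w': 0}):
          # pad over whatever the previous message left behind, then rewrite in place
          print(f'\r{" " * _state["w"]}\r{msg}', end='', file=sys.stderr)
          _state['w'] = len(msg)

      for url in ('https://example.com/a-fairly-long-link', 'https://example.com/b'):
          show_progress(f'checking {url}')
          time.sleep(0.2)          # stand-in for the real HEAD/GET request
      print(file=sys.stderr)       # finish with a newline so the next output starts clean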
  7. yarko revised this gist Dec 27, 2017. 1 changed file with 5 additions and 4 deletions.
    9 changes: 5 additions & 4 deletions check_links.py
    @@ -21,9 +21,9 @@
    from colorama import Fore
    from bs4 import BeautifulSoup, SoupStrainer

    # this is a modification of a pretty terrible gist,
    # but it got me quickly started for now.
    # Original (I may delete this link eventually):
    # this is a modification of a gist;
    # it got me quickly started for now.
    # Original:
    # [email protected]:2872d7f994d192188970408980267e6e.git


    @@ -116,7 +116,8 @@ def pattern_adjust(link_address, rbase=None):
    elif r.scheme == '' and r.netloc == '':
    # is this what I want to do?
    # would I rather do urljoin(urlunsplit(rbase), link_address)?
    return (urljoin(website, link_address), r.netloc) if INTERNAL else (None, None)
    return (urljoin(website, link_address), r.netloc) \
    if INTERNAL else (None, None)
    else:
    return (link_address, r.netloc)

  8. yarko revised this gist Dec 27, 2017. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion check_links.py
    @@ -148,13 +148,14 @@ def extract_link(address):
    websites = arguments['<url>'] # "https://davericho.com/books/"

    colorama_init()
    # to facilitate checking each link only once
    link_status = {}
    # sites which don't accept 'head' requests (dynamic)
    # populate with results of urlsplit().netloc
    FULL_GET = ('www.amazon.com',)
    session = requests.Session()
    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
    print(f'{Fore.CYAN}--- checking links on {website} ---')
  9. yarko revised this gist Dec 27, 2017. 1 changed file with 112 additions and 53 deletions.
    165 changes: 112 additions & 53 deletions check_links.py
    @@ -7,96 +7,155 @@
    Normally, only reports problem links.
    Usage:
    check_link.py [-i] [-v] <url>
    check_link.py [-i] [-v] <url>...
    Options:
    -i --internal check links internal to the site, also;
    -v --verbose report all link outcomes, good and bad;
    -h --help Show this message and exit;
    '''
    import requests
    from urllib.parse import *
    from urllib.parse import urljoin, urlsplit, urlunsplit
    from docopt import docopt
    import colorama,re,queue,threading
    from colorama import init as colorama_init
    from colorama import Fore
    from bs4 import BeautifulSoup,SoupStrainer
    from bs4 import BeautifulSoup, SoupStrainer

    # this is a modification of a pretty terrible gist,
    # but it got me quickly started for now.
    # Original (I may delete this link eventually):
    # [email protected]:2872d7f994d192188970408980267e6e.git

    def check(address):

    def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session
    msg = None # the normal "ok" is no message

    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
    resp=requests.head(address)
    if resp.status_code in [400,404,403,408,409,501,502,503]:
    return f'{Fore.RED}{resp.status_code} - {resp.reason} --> {address}'
    elif VERBOSE:
    return f'{Fore.GREEN}no problem in --> {address}'
    else:
    return None
    # NOTE: amazon denies requests from python scripts, so we use
    # a session with an updated 'User-Agent' throughout ('session')
    # amazon.com remembers if the session.get() was from a python agent,
    # and then denies the session.get(), even if it updated
    # its 'user-agent' with a 503 - Service Unavailable
    # OPTIMIZATION:
    # we try a light-weight session.head() call first;
    # if it fails for a domain w/ 405 (Method Not Allowed), then
    # we retry with a full session.get(), and log the location so
    # we always try the long way;
    resp = retrieve(address)
    if resp.status_code == 405:
    resp = session.get(address)
    FULL_GET.add(netloc)
    except Exception as e:
    return f'{Fore.YELLOW}{e} - {address}'
    pass

    def pattern_adjust(link_address):
    if resp.status_code in \
    [301, 308,
    400, 401, 402, 403, 404, 405, 408, 409, 410,
    501, 502, 503]:
    msg = f'{resp.status_code} - {resp.reason} => {address}'
    # TODO: scrub other permanent redirection codes to include in this:
    if resp.status_code == 301:
    newaddress = urljoin(address,
    resp.headers["Location"].split(";")[0])
    msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
    msg = f'{Fore.GREEN}{resp.status_code} - {resp.reason} => {address}'
    return msg


    def pattern_adjust(link_address, rbase=None):
    '''
    don't follow local addresses, unless specifically
    set option to follow internal addresses
    returns "adjusted" address and netloc;
    don't follow local addresses, unless
    option set to follow internal addresses
    '''
    if re.match('^#' ,link_address): # local
    return 0
    global website
    global INTERNAL

    if link_address[0] == '#': # local
    return (None, None)

    # create a local static var:
    # - depends on global "website"
    if 'rbase' not in pattern_adjust.__dict__:
    global website
    global INTERNAL
    pattern_adjust.rbase = urlsplit(website)
    # TODO: this has potential to break when
    # we loop over multiple URLs
    # if 'rbase' not in pattern_adjust.__dict__ \
    # pattern_adjust.rbase = urlsplit(website)

    r=urlsplit(link_address)
    r = urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
    r.netloc == pattern_adjust.rbase.netloc:
    return 0
    r.netloc == rbase.netloc:
    return (None, None)
    # TODO: this could possibly follow local links
    if r.scheme=='' and (r.netloc!='' or r.path!=''):
    d=urlunsplit(r)
    if re.match('^//' ,d):
    m= re.search('(?<=//)\S+', d)
    d=m.group(0)
    return "https://"+d
    elif r.scheme=='' and r.netloc=='':
    return websit+link_address if INTERNAL else 0
    # NOTE: I don't really understand
    # what this is doing, so annotating:
    # if relative URL (local)
    # TODO: I am getting convinced what this wants to do
    # should be done w/ a simple urljoin()
    # I'm also thinking this code branch isn't traversed;
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
    # reconstitute - it won't be a full path
    d = urlunsplit(r)
    # This if seems exceedingly wonky
    if d.startswith('//'):
    # if it starts with '//', throw that away...
    # m = re.search('(?<=//)\S+', d)
    # d = m.group(0)
    # TODO: if r.netloc is empty, then this
    # could result in an incorrect URL:
    # => if address = foo.com/something - then ok
    # => if address relative: ./static/something - then trouble
    return ("https://" + d[2:], r.netloc)
    elif r.scheme == '' and r.netloc == '':
    # is this what I want to do?
    # would I rather do urljoin(urlunsplit(rbase), link_address)?
    return (urljoin(website, link_address), r.netloc) if INTERNAL else (None, None)
    else:
    return link_address
    return (link_address, r.netloc)


    def extract_link(address):
    global link_status
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    response=requests.get(address)
    # for key,value in iter(list(tags.items())):
    global session

    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}

    # the partitioned pieces of URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
    try:
    for link in BeautifulSoup(response.content,"html.parser",parse_only=SoupStrainer(key)):
    if link.has_attr(value):
    p=pattern_adjust(link[value])
    if p and not p in link_status:
    link_status[p] = check(p)
    if link_status[p]:
    print(link_status[p])
    except Exception as e:
    print((e,address))


    if __name__=="__main__":
    for link in BeautifulSoup(response.content, "html.parser",
    parse_only=SoupStrainer(key)):
    if link.has_attr(value):
    p, netloc = pattern_adjust(link[value], rbase)
    if p and p not in link_status:
    link_status[p] = check(p, netloc)
    if link_status[p]:
    print(link_status[p])


    if __name__ == "__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>'] # "https://davericho.com/books/"

    colorama.init()
    colorama_init()
    link_status = {}
    website = arguments['<url>'] # "https://davericho.com/books/"
    result = extract_link(website)
    # sites which don't accept 'head' requests (dynamic)
    # populate with results of urlsplit().netloc
    FULL_GET = ('www.amazon.com',)
    session = requests.Session()
    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
    print(f'{Fore.CYAN}--- checking links on {website} ---')
    result = extract_link(website)
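    The retrieval strategy introduced in this rewrite: try a cheap session.head() first, and only when a host answers 405 (Method Not Allowed) retry with session.get(), recording that netloc in FULL_GET so later links on the same host go straight to GET. A condensed sketch of the fallback (not the gist verbatim; FULL_GET is a set here):

      import requests
      from urllib.parse import urlsplit

      FULL_GET = set()                     # netlocs known to reject HEAD requests
      session = requests.Session()
      session.headers.update({'User-Agent': 'test'})

      def fetch(address):
          netloc = urlsplit(address).netloc
          retrieve = session.get if netloc in FULL_GET else session.head
          resp = retrieve(address)
          if resp.status_code == 405:      # host refuses HEAD - fall back to GET
              resp = session.get(address)
              FULL_GET.add(netloc)         # remember, so next time we skip the HEAD
          return resp

      # resp = fetch('https://www.example.com/')
      # print(resp.status_code, resp.reason)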
  10. yarko renamed this gist Dec 27, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  11. yarko revised this gist Dec 27, 2017. 1 changed file with 87 additions and 58 deletions.
    145 changes: 87 additions & 58 deletions check_link.py
    100644 → 100755
    @@ -1,73 +1,102 @@
    from bs4 import BeautifulSoup,SoupStrainer
    import urllib.request
    #!/usr/bin/env python
    '''
    check_link.py
    Parses a url, and checks links in it for validity.
    Normally, does not check links to the target URL.
    Normally, only reports problem links.
    Usage:
    check_link.py [-i] [-v] <url>
    Options:
    -i --internal check links internal to the site, also;
    -v --verbose report all link outcomes, good and bad;
    -h --help Show this message and exit;
    '''
    import requests
    from urllib.parse import *
    from docopt import docopt
    import colorama,re,queue,threading
    from colorama import Fore
    from urllib.parse import *
    from bs4 import BeautifulSoup,SoupStrainer

    class check_link():
    def __init__(self,address):
    self.address=address
    def check(self,address):
    try:
    req=urllib.request.Request(url=address)
    resp=urllib.request.urlopen(req)
    if resp.status in [400,404,403,408,409,501,502,503]:print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
    else: print (Fore.GREEN+"no problem in-->"+address)

    except Exception as e:
    print (Fore.YELLOW+"{}-{}".format(e,address))
    pass
    def pattern_adjust(a):
    # this is a modification of a pretty terrible gist,
    # but it got me quickly started for now.
    # Original (I may delete this link eventually):
    # [email protected]:2872d7f994d192188970408980267e6e.git

    def check(address):
    global VERBOSE
    try:
    if re.match('^#' ,a):return 0
    r=urlsplit(a)
    if r.scheme=='' and (r.netloc!='' or r.path!=''):
    d=urlunsplit(r)
    if re.match('^//' ,d):
    m= re.search('(?<=//)\S+', d)
    d=m.group(0)
    m="https://"+d
    return m
    elif r.scheme=='' and r.netloc=='':
    return address+a
    else:return a
    resp=requests.head(address)
    if resp.status_code in [400,404,403,408,409,501,502,503]:
    return f'{Fore.RED}{resp.status_code} - {resp.reason} --> {address}'
    elif VERBOSE:
    return f'{Fore.GREEN}no problem in --> {address}'
    else:
    return None
    except Exception as e:
    return f'{Fore.YELLOW}{e} - {address}'
    pass

    def pattern_adjust(link_address):
    '''
    don't follow local addresses, unless specifically
    set option to follow internal addresses
    '''
    if re.match('^#' ,link_address): # local
    return 0

    # create a local static var:
    # - depends on global "website"
    if 'rbase' not in pattern_adjust.__dict__:
    global website
    global INTERNAL
    pattern_adjust.rbase = urlsplit(website)

    r=urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
    r.netloc == pattern_adjust.rbase.netloc:
    return 0
    # TODO: this could possibly follow local links
    if r.scheme=='' and (r.netloc!='' or r.path!=''):
    d=urlunsplit(r)
    if re.match('^//' ,d):
    m= re.search('(?<=//)\S+', d)
    d=m.group(0)
    return "https://"+d
    elif r.scheme=='' and r.netloc=='':
    return websit+link_address if INTERNAL else 0
    else:
    return link_address


    def extract_link(address):
    global link_status
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    for key,value in iter(tags.items()):
    response=requests.get(address)
    # for key,value in iter(list(tags.items())):
    for key, value in tags.items():
    try:
    res=urllib.request.urlopen(address)
    response=res.read().decode('utf-8') #needs improvement
    for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)):
    for link in BeautifulSoup(response.content,"html.parser",parse_only=SoupStrainer(key)):
    if link.has_attr(value):
    p=pattern_adjust(link[value])
    if p!=0 and str(p)!='None':
    newcheck=check_link(p)
    newcheck.check(p)
    if p not in hyperlinks:
    hyperlinks.add(p)
    if website.split('.')[1] in p:#needs improvement
    if not website.endswith(('.png','.jpeg','.js','jpg')):
    q.put(p)
    if p and not p in link_status:
    link_status[p] = check(p)
    if link_status[p]:
    print(link_status[p])
    except Exception as e:
    print (e,address)
    def threader():
    while True:
    value=q.get()
    result=extract_link(value)
    q.task_done()
    print((e,address))


    if __name__=="__main__":
    colorama.init()
    q=queue.Queue()
    global hyperlinks,website
    hyperlinks=set()
    website=input("Please enter the website address: ")
    for x in range(30):
    t=threading.Thread(target=threader)
    t.deamon=True
    t.start()
    q.put(website.strip())
    q.join()
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']

    colorama.init()
    link_status = {}
    website = arguments['<url>'] # "https://davericho.com/books/"
    result = extract_link(website)
  12. @hackerdem hackerdem created this gist May 11, 2016.
    73 changes: 73 additions & 0 deletions check_link.py
    @@ -0,0 +1,73 @@
    from bs4 import BeautifulSoup,SoupStrainer
    import urllib.request
    import colorama,re,queue,threading
    from colorama import Fore
    from urllib.parse import *

    class check_link():
    def __init__(self,address):
    self.address=address
    def check(self,address):
    try:
    req=urllib.request.Request(url=address)
    resp=urllib.request.urlopen(req)
    if resp.status in [400,404,403,408,409,501,502,503]:print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
    else: print (Fore.GREEN+"no problem in-->"+address)

    except Exception as e:
    print (Fore.YELLOW+"{}-{}".format(e,address))
    pass
    def pattern_adjust(a):
    try:
    if re.match('^#' ,a):return 0
    r=urlsplit(a)
    if r.scheme=='' and (r.netloc!='' or r.path!=''):
    d=urlunsplit(r)
    if re.match('^//' ,d):
    m= re.search('(?<=//)\S+', d)
    d=m.group(0)
    m="https://"+d
    return m
    elif r.scheme=='' and r.netloc=='':
    return address+a
    else:return a
    except Exception as e:
    pass
    def extract_link(address):
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    for key,value in iter(tags.items()):
    try:
    res=urllib.request.urlopen(address)
    response=res.read().decode('utf-8') #needs improvement
    for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)):
    if link.has_attr(value):
    p=pattern_adjust(link[value])
    if p!=0 and str(p)!='None':
    newcheck=check_link(p)
    newcheck.check(p)
    if p not in hyperlinks:
    hyperlinks.add(p)
    if website.split('.')[1] in p:#needs improvement
    if not website.endswith(('.png','.jpeg','.js','jpg')):
    q.put(p)
    except Exception as e:
    print (e,address)
    def threader():
    while True:
    value=q.get()
    result=extract_link(value)
    q.task_done()

    if __name__=="__main__":
    colorama.init()
    q=queue.Queue()
    global hyperlinks,website
    hyperlinks=set()
    website=input("Please enter the website address: ")
    for x in range(30):
    t=threading.Thread(target=threader)
    t.deamon=True
    t.start()
    q.put(website.strip())
    q.join()