@joshyu
Forked from hackerdem/check_link.py
Created February 23, 2022 07:28
Revisions

  1. @hackerdem revised this gist Jul 31, 2020. 1 changed file with 6 additions and 4 deletions.
    10 changes: 6 additions & 4 deletions check_link.py
    @@ -11,7 +11,8 @@ def check(self,address):
             try:
                 req=urllib.request.Request(url=address)
                 resp=urllib.request.urlopen(req)
    -            if resp.status in [400,404,403,408,409,501,502,503]:print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
    +            if resp.status in [400,404,403,408,409,501,502,503]:
    +                print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
             else: print (Fore.GREEN+"no problem in-->"+address)

             except Exception as e:
    @@ -37,10 +38,11 @@ def extract_link(address):
         tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
         for key,value in iter(tags.items()):
             try:
    -            res=urllib.request.urlopen(address)
    +            headers={"User-Agent": "Mozilla/5.0"}
    +            res=urllib.request.urlopen(urllib.request.Request(url=address, headers=headers))
                 response=res.read().decode('utf-8') #needs improvement
                 for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)):
    -                if link.has_attr(value):
    +                if link.has_attr(value) and address in link[value]: # address in link[value] to keep testing the target site only
                         p=pattern_adjust(link[value])
                         if p!=0 and str(p)!='None':
                             newcheck=check_link(p)
    @@ -63,7 +65,7 @@ def threader():
         q=queue.Queue()
         global hyperlinks,website
         hyperlinks=set()
    -    website=input("Please enter the website address: ")
    +    website= 'https://www.sozcu.com.tr/' #Target website
         for x in range(30):
             t=threading.Thread(target=threader)
             t.deamon=True
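
    Note: two of these changes matter in practice. The added User-Agent header works
    around servers that answer 403 Forbidden to urllib's default "Python-urllib"
    agent, and the "address in link[value]" guard keeps the crawler on the target
    site instead of wandering to external links. A minimal standalone sketch of the
    header change (the URL is a hypothetical placeholder, not from the gist):

        import urllib.request

        url = "https://example.com/"             # hypothetical target
        headers = {"User-Agent": "Mozilla/5.0"}  # browser-like agent, as in the revision

        # Without explicit headers, urllib sends "User-Agent: Python-urllib/3.x",
        # which some servers reject outright with 403.
        req = urllib.request.Request(url=url, headers=headers)
        with urllib.request.urlopen(req) as resp:
            print(resp.status, resp.reason)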
  2. @hackerdem revised this gist Jul 31, 2020. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions linkcheck.txt
    @@ -0,0 +1 @@
    +foo
  3. @hackerdem created this gist May 11, 2016.
    73 changes: 73 additions & 0 deletions check_link.py
    @@ -0,0 +1,73 @@
    from bs4 import BeautifulSoup,SoupStrainer
    import urllib.request
    import colorama,re,queue,threading
    from colorama import Fore
    from urllib.parse import *

    class check_link():
        def __init__(self,address):
            self.address=address
        def check(self,address):
            try:
                req=urllib.request.Request(url=address)
                resp=urllib.request.urlopen(req)
                if resp.status in [400,404,403,408,409,501,502,503]:print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
                else: print (Fore.GREEN+"no problem in-->"+address)

            except Exception as e:
                print (Fore.YELLOW+"{}-{}".format(e,address))
                pass
    def pattern_adjust(a):
        try:
            if re.match('^#' ,a):return 0
            r=urlsplit(a)
            if r.scheme=='' and (r.netloc!='' or r.path!=''):
                d=urlunsplit(r)
                if re.match('^//' ,d):
                    m= re.search('(?<=//)\S+', d)
                    d=m.group(0)
                    m="https://"+d
                    return m
            elif r.scheme=='' and r.netloc=='':
                return address+a
            else:return a
        except Exception as e:
            pass
    def extract_link(address):
        tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
        for key,value in iter(tags.items()):
            try:
                res=urllib.request.urlopen(address)
                response=res.read().decode('utf-8') #needs improvement
                for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)):
                    if link.has_attr(value):
                        p=pattern_adjust(link[value])
                        if p!=0 and str(p)!='None':
                            newcheck=check_link(p)
                            newcheck.check(p)
                            if p not in hyperlinks:
                                hyperlinks.add(p)
                                if website.split('.')[1] in p:#needs improvement
                                    if not website.endswith(('.png','.jpeg','.js','jpg')):
                                        q.put(p)
            except Exception as e:
                print (e,address)
    def threader():
        while True:
            value=q.get()
            result=extract_link(value)
            q.task_done()

    if __name__=="__main__":
        colorama.init()
        q=queue.Queue()
        global hyperlinks,website
        hyperlinks=set()
        website=input("Please enter the website address: ")
        for x in range(30):
            t=threading.Thread(target=threader)
            t.deamon=True
            t.start()
        q.put(website.strip())
        q.join()
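
    Note: as created, the script has three bugs that the later revision never
    addressed. resp.status is an int, so the string concatenation in check()
    raises TypeError for any matching error status; "t.deamon=True" misspells
    daemon and silently sets an unused attribute, so the worker threads are not
    actually daemonic; and the extension filter tests website (the start page)
    rather than the extracted link p, with a missing dot on 'jpg' in its tuple.
    A corrected sketch of check() (mine, not part of the gist):

        import urllib.request
        from colorama import Fore

        def check(address):
            try:
                req = urllib.request.Request(url=address)
                resp = urllib.request.urlopen(req)
                if resp.status in [400, 403, 404, 408, 409, 501, 502, 503]:
                    # str() is required: int + str concatenation raises TypeError
                    print(Fore.RED + str(resp.status) + "-" + resp.reason + "-->" + address)
                else:
                    print(Fore.GREEN + "no problem in-->" + address)
            except Exception as e:
                # urlopen() raises HTTPError for most 4xx/5xx responses, so failing
                # URLs usually land here rather than in the status check above.
                print(Fore.YELLOW + "{}-{}".format(e, address))

    The other two fixes are one-liners: t.daemon = True in the main block, and
    if not p.endswith(('.png','.jpeg','.js','.jpg')) in extract_link() so the
    filter applies to each discovered link.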