Skip to content

Instantly share code, notes, and snippets.

@charanjit-singh
Created March 10, 2021 13:54
Show Gist options
  • Select an option

  • Save charanjit-singh/44b792fd92c4ab3f709422589a7f08a9 to your computer and use it in GitHub Desktop.

Select an option

Save charanjit-singh/44b792fd92c4ab3f709422589a7f08a9 to your computer and use it in GitHub Desktop.

Revisions

  1. charanjit-singh created this gist Mar 10, 2021.
    153 changes: 153 additions & 0 deletions scrape.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,153 @@
    import requests
    from bs4 import BeautifulSoup
    import codecs
    import threading

    # Parse the locally saved RERA Punjab listing page into BROKERS: one dict
    # per table row, keyed by the column names later used as CSV headers.
    # No explicit encoding is given, so the platform default applies; the
    # context manager makes sure the handle is closed after reading.
    with open("RERA_PUNJAB_LIST.html", "r") as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')

    BROKERS = []

    for tr in soup.find_all("tr"):
        tds = tr.find_all("td")
        anchors = tr.find_all("a")
        inputs = tr.find_all("input")
        # Rows without the five data cells, a link and an <input> (for example
        # a header row made of <th> cells) cannot be indexed below; skip them
        # instead of aborting the whole run with an IndexError.
        if len(tds) < 5 or not anchors or not inputs:
            continue
        # Every text column is stripped, including the validity date, so the
        # CSV does not carry stray whitespace from the HTML layout.
        sr, name, district, rera, registration_valid_upto = (
            td.get_text().strip() for td in tds[:5]
        )
        # The id of the first link tells which detail view the row opens.
        type_ = anchors[0].get('id')
        agent_id = inputs[0].get("value")
        BROKERS.append({
            "Sr. No.": sr,
            "Name": name,
            "District": district,
            "RERA No.": rera,
            "Registration Valid Upto": registration_valid_upto,
            "Offline": 1 if type_ == 'modalOpenerOfflineRegisteredButton' else 0,
            "Agent ID": agent_id,
        })

    import csv

    # Dump the parsed listing to CSV, one row per broker.
    csv_file = "PUNJAB_RERA_LIST.csv"
    # Guard the header lookup: with an empty listing BROKERS[0] would raise
    # IndexError, so fall back to an empty header instead.
    fieldnames = list(BROKERS[0].keys()) if BROKERS else []
    try:
        # newline='' hands line-ending control to the csv module; without it
        # an extra blank line follows every row on Windows.
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(BROKERS)
    except IOError:
        print("I/O error")

    import re
    import time



    class OnlineThread(threading.Thread):
        """Worker thread that enriches online-registered brokers and saves them.

        For every entry of the module-level ``BROKERS`` list whose ``"Offline"``
        flag is 0, the agent detail page is downloaded, the text of each
        ``<td>`` carrying the ``single-detail`` class is paired with the text of
        the ``<td>`` that follows it, and the pair is stored on the broker's
        dict. The enriched rows are written to ``ONLINE_PUNJAB_RERA_LIST.csv``.

        Args:
            name: Thread name.
            timeout: Seconds to wait for the server on each request before the
                broker is recorded as failed. Without a timeout a stalled
                connection would block the worker forever.
        """

        def __init__(self, name, timeout=60):
            threading.Thread.__init__(self)
            self.name = name
            self.request_timeout = timeout

        def run(self):
            """Fetch details for each online broker, then write the CSV."""
            online_brokers = []
            # Start from the listing columns; detail labels are appended in the
            # order they are first seen. An empty listing gives an empty header
            # instead of an IndexError on BROKERS[0].
            online_keys = list(BROKERS[0].keys()) if BROKERS else []
            # ONLINE Registered CASE
            pending = [broker for broker in BROKERS if broker['Offline'] == 0]
            total = len(pending)
            failed_onlines = []
            for position, broker in enumerate(pending, start=1):
                agent_id = broker['Agent ID']
                print("Processing Online Broker:", position, '/', total)
                if not agent_id:
                    # The URL cannot be built without an id; record the broker
                    # as failed rather than letting a TypeError kill the thread.
                    failed_onlines.append(broker)
                    print("Failed Online Broker", broker, "Reason:", "missing Agent ID")
                    continue
                url = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewDetails?inAgent_ID=" + agent_id
                try:
                    broker_page = requests.get(url, timeout=self.request_timeout)
                    # Treat HTTP error responses like connection failures so an
                    # error page is never parsed as if it were a detail page.
                    broker_page.raise_for_status()
                except Exception as e:
                    # Deliberately broad: one failing broker must not abort the
                    # run and lose the rows collected so far.
                    failed_onlines.append(broker)
                    print("Failed Online Broker", broker, "Reason:", str(e))
                    # Pause before moving on to the next broker.
                    time.sleep(5)
                    continue
                broker_soup = BeautifulSoup(broker_page.content, "html.parser")
                tds = broker_soup.find_all("td")
                # Walk adjacent cell pairs: a "single-detail" cell is the label
                # and the cell right after it holds the value.
                for label_td, value_td in zip(tds, tds[1:]):
                    if "single-detail" not in (label_td.get("class") or []):
                        continue
                    key = label_td.get_text().strip()
                    if key not in online_keys:
                        online_keys.append(key)
                    # Flatten line breaks and collapse runs of spaces so the
                    # value fits on one CSV line.
                    flat = value_td.get_text().strip().replace("\r", " ").replace("\n", " ")
                    # The broker dict from BROKERS is updated in place.
                    broker[key] = re.sub(' +', ' ', flat)
                online_brokers.append(broker)

            csv_file = "ONLINE_PUNJAB_RERA_LIST.csv"
            try:
                # newline='' prevents blank lines between rows on Windows.
                with open(csv_file, 'w', newline='') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=online_keys)
                    writer.writeheader()
                    writer.writerows(online_brokers)
            except IOError:
                print("I/O error")
            print("FAILED ONLINE BROKERS", failed_onlines)

    class OfflineThread(threading.Thread):
        """Worker thread that enriches offline-registered brokers and saves them.

        For every entry of the module-level ``BROKERS`` list whose ``"Offline"``
        flag is 1, the offline-registered detail page is downloaded, the text of
        each ``<td>`` carrying the ``single-detail`` class is paired with the
        text of the ``<td>`` that follows it, and the pair is stored on the
        broker's dict. The enriched rows are written to
        ``OFFLINE_PUNJAB_RERA_LIST.csv``.

        Args:
            name: Thread name.
            timeout: Seconds to wait for the server on each request before the
                broker is recorded as failed. Without a timeout a stalled
                connection would block the worker forever.
        """

        def __init__(self, name, timeout=60):
            threading.Thread.__init__(self)
            self.name = name
            self.request_timeout = timeout

        def run(self):
            """Fetch details for each offline broker, then write the CSV."""
            offline_brokers = []
            # Start from the listing columns; detail labels are appended in the
            # order they are first seen. An empty listing gives an empty header
            # instead of an IndexError on BROKERS[0].
            offline_keys = list(BROKERS[0].keys()) if BROKERS else []
            # OFFLINE Registered CASE
            pending = [broker for broker in BROKERS if broker['Offline'] == 1]
            total = len(pending)
            failed_offlines = []
            for position, broker in enumerate(pending, start=1):
                agent_id = broker['Agent ID']
                if not agent_id:
                    # The URL cannot be built without an id; record the broker
                    # as failed rather than letting a TypeError kill the thread.
                    print("Failed offline Broker", broker, "Reason:", "missing Agent ID")
                    failed_offlines.append(broker)
                    continue
                url = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewOfflineRegisteredDetails?inAgent_ID=" + agent_id
                print("Processing Offline Broker:", position, '/', total, "URL:", url)
                try:
                    broker_page = requests.get(url, timeout=self.request_timeout)
                    # Treat HTTP error responses like connection failures so an
                    # error page is never parsed as if it were a detail page.
                    broker_page.raise_for_status()
                except Exception as e:
                    # Deliberately broad: one failing broker must not abort the
                    # run and lose the rows collected so far.
                    print("Failed offline Broker", broker, "Reason:", str(e))
                    failed_offlines.append(broker)
                    continue
                broker_soup = BeautifulSoup(broker_page.content, "html.parser")
                tds = broker_soup.find_all("td")
                # Walk adjacent cell pairs: a "single-detail" cell is the label
                # and the cell right after it holds the value.
                for label_td, value_td in zip(tds, tds[1:]):
                    if "single-detail" not in (label_td.get("class") or []):
                        continue
                    key = label_td.get_text().strip()
                    if key not in offline_keys:
                        offline_keys.append(key)
                    # Flatten line breaks and collapse runs of spaces so the
                    # value fits on one CSV line.
                    flat = value_td.get_text().strip().replace("\r", " ").replace("\n", " ")
                    # The broker dict from BROKERS is updated in place.
                    broker[key] = re.sub(' +', ' ', flat)
                offline_brokers.append(broker)

            csv_file = "OFFLINE_PUNJAB_RERA_LIST.csv"
            try:
                # newline='' prevents blank lines between rows on Windows.
                with open(csv_file, 'w', newline='') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=offline_keys)
                    writer.writeheader()
                    writer.writerows(offline_brokers)
            except IOError:
                print("I/O error")
            print("FAILED OFFLINE BROKERS", failed_offlines)

    # Build one worker per registration type, then launch them in creation
    # order (online first, offline second) so both scrapes run concurrently.
    onlineThread, offlineThread = OnlineThread("Online"), OfflineThread("Offline")

    for worker in (onlineThread, offlineThread):
        worker.start()