"""Scrape the Punjab RERA agent listing and each agent's detail page.

The script reads a locally saved listing page (``RERA_PUNJAB_LIST.html``),
writes the rows it finds to ``PUNJAB_RERA_LIST.csv``, and then starts two
threads that fetch the per-agent detail pages (one thread for online
registrations, one for offline registrations) and write the enriched
records to ``ONLINE_PUNJAB_RERA_LIST.csv`` and
``OFFLINE_PUNJAB_RERA_LIST.csv`` respectively.
"""
import codecs
import csv
import re
import threading
import time

import requests
from bs4 import BeautifulSoup

# Columns produced from the listing page, in the order they are written.
# Kept as a constant so CSV headers can be produced even when no broker
# rows were parsed.
BASE_KEYS = [
    "Sr. No.",
    "Name",
    "District",
    "RERA No.",
    "Registration Valid Upto",
    "Offline",
    "Agent ID",
]

# Anchor id that marks a listing row as an offline registration.
OFFLINE_ANCHOR_ID = 'modalOpenerOfflineRegisteredButton'

ONLINE_DETAILS_URL = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewDetails?inAgent_ID="
OFFLINE_DETAILS_URL = "https://rera.punjab.gov.in/reraindex/PublicView/AgentViewOfflineRegisteredDetails?inAgent_ID="

# Seconds to wait on a detail-page request before treating it as failed;
# without a timeout a stalled connection would block the thread forever.
REQUEST_TIMEOUT = 30

# Collapses runs of spaces in scraped values.
_MULTI_SPACE = re.compile(' +')


def _parse_listing_row(tr):
    """Build a broker record from one ``<tr>`` of the listing page.

    Returns the record as a dict keyed by ``BASE_KEYS``, or ``None`` when
    the row does not have the five data cells, the anchor and the input
    that a broker row carries (e.g. a header row made of ``<th>`` cells).
    """
    tds = tr.find_all("td")
    anchors = tr.find_all("a")
    inputs = tr.find_all("input")
    if len(tds) < 5 or not anchors or not inputs:
        return None

    sr, name, district, rera, valid_upto = (td.get_text() for td in tds[:5])
    return {
        "Sr. No.": sr.strip(),
        "Name": name.strip(),
        "District": district.strip(),
        "RERA No.": rera.strip(),
        "Registration Valid Upto": valid_upto.strip(),
        "Offline": 1 if anchors[0].get('id') == OFFLINE_ANCHOR_ID else 0,
        "Agent ID": inputs[0].get("value"),
    }


def _write_csv(csv_file, fieldnames, rows):
    """Write ``rows`` (dicts) to ``csv_file`` with a header of ``fieldnames``.

    An ``IOError`` is reported on stdout rather than raised.
    """
    try:
        # newline='' is what the csv module expects of the file object it
        # writes to; otherwise extra blank lines appear on Windows.
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
    except IOError:
        print("I/O error")


def _merge_details(content, broker, keys):
    """Copy the label/value pairs of a detail page into ``broker``.

    Every ``<td>`` whose class includes ``single-detail`` is treated as a
    label and the ``<td>`` that follows it as the value. New labels are
    appended to ``keys`` so they become CSV columns. Both ``broker`` and
    ``keys`` are modified in place.
    """
    cells = BeautifulSoup(content, "html.parser").find_all("td")
    for label_td, value_td in zip(cells, cells[1:]):
        if "single-detail" not in (label_td.get("class") or []):
            continue
        key = label_td.get_text().strip()
        if key not in keys:
            keys.append(key)
        flattened = value_td.get_text().strip().replace("\r", " ").replace("\n", " ")
        broker[key] = _MULTI_SPACE.sub(' ', flattened)


with codecs.open("RERA_PUNJAB_LIST.html", "r") as f:
    page = f.read()
soup = BeautifulSoup(page, 'html.parser')

BROKERS = []
for tr in soup.find_all("tr"):
    record = _parse_listing_row(tr)
    # Rows that are not broker rows are skipped instead of aborting the run.
    if record is not None:
        BROKERS.append(record)

_write_csv("PUNJAB_RERA_LIST.csv", BASE_KEYS, BROKERS)


class OnlineThread(threading.Thread):
    """Fetch detail pages for brokers whose ``Offline`` flag is 0."""

    def __init__(self, name,):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        """Enrich every online broker and write ONLINE_PUNJAB_RERA_LIST.csv.

        Brokers whose detail page cannot be fetched are left out of the
        CSV and printed at the end.
        """
        online_keys = list(BASE_KEYS)
        pending = [broker for broker in BROKERS if broker['Offline'] == 0]
        total = len(pending)
        enriched = []
        failed = []

        for position, broker in enumerate(pending, start=1):
            print("Processing Online Broker:", position, '/', total)
            url = ONLINE_DETAILS_URL + broker['Agent ID']
            try:
                broker_page = requests.get(url, timeout=REQUEST_TIMEOUT)
            except Exception as e:
                # Deliberately broad: one failed fetch must not abort the batch.
                failed.append(broker)
                print("Failed Online Broker", broker, "Reason:", str(e))
                # Pause before moving on to the next broker.
                time.sleep(5)
                continue
            _merge_details(broker_page.content, broker, online_keys)
            enriched.append(broker)

        _write_csv("ONLINE_PUNJAB_RERA_LIST.csv", online_keys, enriched)
        print("FAILED ONLINE BROKERS", failed)


class OfflineThread(threading.Thread):
    """Fetch detail pages for brokers whose ``Offline`` flag is 1."""

    def __init__(self, name,):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        """Enrich every offline broker and write OFFLINE_PUNJAB_RERA_LIST.csv.

        Brokers whose detail page cannot be fetched are left out of the
        CSV and printed at the end.
        """
        offline_keys = list(BASE_KEYS)
        pending = [broker for broker in BROKERS if broker['Offline'] == 1]
        total = len(pending)
        enriched = []
        failed = []

        for position, broker in enumerate(pending, start=1):
            url = OFFLINE_DETAILS_URL + broker['Agent ID']
            print("Processing Offline Broker:", position, '/', total, "URL:", url)
            try:
                broker_page = requests.get(url, timeout=REQUEST_TIMEOUT)
            except Exception as e:
                # Deliberately broad: one failed fetch must not abort the batch.
                print("Failed offline Broker", broker, "Reason:", str(e))
                failed.append(broker)
                continue
            _merge_details(broker_page.content, broker, offline_keys)
            enriched.append(broker)

        _write_csv("OFFLINE_PUNJAB_RERA_LIST.csv", offline_keys, enriched)
        print("FAILED OFFLINE BROKERS", failed)


onlineThread = OnlineThread("Online")
offlineThread = OfflineThread("Offline")
onlineThread.start()
offlineThread.start()