"""
requirements.txt
    selenium==3.141.0
    selenium-wire==4.4.1
"""
import json
from time import sleep

from seleniumwire.webdriver import Firefox
from selenium.webdriver.firefox.options import Options as firefox_options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class Driver:
    """
    driver setup:
    Although I'm using geckodriver; chromedriver, remotedriver, etc. can also be used
    """

    def __init__(self,
                 driver_path: str = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver") -> None:
        """Remember the geckodriver path and start the browser immediately.

        Args:
            driver_path: filesystem path of the geckodriver executable.
        """
        self.driver_path = driver_path
        self.browser = None
        self.setup()

    def setup(self) -> None:
        """Start a headless, request-capturing Firefox and store it on ``self.browser``."""
        firefox_opts = firefox_options()
        firefox_opts.headless = True
        user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/60.0.3112.50 Safari/537.36')
        # NOTE(review): the arguments below look like Chromium-style flags;
        # confirm that Firefox/geckodriver actually honours them.
        firefox_opts.add_argument(f'user-agent={user_agent}')
        firefox_opts.add_argument('--no-sandbox')
        firefox_opts.add_argument("--disable-extensions")
        firefox_opts.add_argument('--disable-dev-shm-usage')

        # selenium-wire options. Other switches that were tried and left off:
        # 'enable_har', 'disable_encoding', 'suppress_connection_errors'.
        options = {
            'exclude_hosts': [
                'google-analytics.com',
                'analytics.google.com',
                'google.com',
                'facebook.com',
                'stats.g.doubleclick.net',
            ],
        }
        self.browser = Firefox(executable_path=self.driver_path,
                               desired_capabilities=firefox_opts.to_capabilities(),
                               seleniumwire_options=options)

    def tear_down(self) -> None:
        """Quit the browser if one is running; safe to call more than once."""
        if self.browser is None:
            return
        try:
            self.browser.quit()
        finally:
            # Drop the reference even if quit() raised, so a second call is a no-op.
            self.browser = None


class Scraper:
    """
    Given a base_url, capture all restaurants (based on user's submitted location, e.g., Manila)
    latitude & longitude by intercepting grab-foods internal POST request.

    self.grab_internal_post_api is found by manually inspecting all XHR made by grab-foods,
    using chrome dev tools.
    """

    def __init__(self, driver: Driver,
                 base_url: str = "https://food.grab.com/ph/en/restaurants") -> None:
        """Store the driver and URLs, then open ``base_url`` in the browser.

        Args:
            driver: a started ``Driver`` whose ``browser`` is used for all requests.
            base_url: restaurant listing page to open.
        """
        self.driver = driver
        self.base_url = base_url
        self.grab_internal_post_api = "https://portal.grab.com/foodweb/v2/search"
        self._init_request()

    def _init_request(self) -> None:
        """Open the listing page and pause so the page can finish loading."""
        self.driver.browser.get(self.base_url)
        sleep(10)

    def load_more(self, max_pages=None) -> None:
        """Click the "load more results" button until it no longer shows up.

        Previously captured requests are discarded first so that only the
        requests triggered by these clicks remain in the capture.

        Args:
            max_pages: optional upper bound on the number of clicks;
                ``None`` (the default) keeps clicking until the button is gone.
        """
        del self.driver.browser.requests

        locator = (By.XPATH, '//button[contains(@class, "ant-btn ant-btn-block")]')
        waiter = WebDriverWait(self.driver.browser, 10, poll_frequency=1)
        page_num = 0

        while max_pages is None or page_num < max_pages:
            # Look the button up again on every pass: a reference obtained
            # before the page re-rendered may no longer be attached to the DOM.
            try:
                button = waiter.until(EC.element_to_be_clickable(locator))
            except TimeoutException:
                print("No more LOAD MORE RESULTS button to be clicked!!!\n")
                break

            try:
                button.click()
            except StaleElementReferenceException:
                # The button was replaced between lookup and click; find it again.
                continue

            page_num += 1
            print('page_num: ', page_num)
            sleep(10)  # give the triggered search request time to complete

    def capture_post_response(self) -> list:
        """Collect the decoded JSON bodies of the captured internal search POSTs.

        Returns:
            A list with one parsed JSON payload per matching request that has a
            response whose body is valid UTF-8 JSON. Requests without a response,
            or with a body that cannot be parsed, are skipped.
        """
        post_data = []
        for r in self.driver.browser.iter_requests():
            if r.method != 'POST' or r.url != self.grab_internal_post_api:
                continue
            if r.response is None:
                # The request was captured but never answered.
                continue
            try:
                post_data.append(json.loads(r.response.body.decode('utf-8')))
            except ValueError as err:
                # UnicodeDecodeError and json.JSONDecodeError are both ValueErrors.
                # NOTE(review): if bodies arrive compressed, the selenium-wire
                # 'disable_encoding' option may be needed -- confirm.
                print(f"Skipping unreadable response body: {type(err).__name__}: {err}")
        return post_data

    def get_restaurant_latlng(self, post_data) -> dict:
        """Map each restaurant to its name and coordinates.

        Args:
            post_data: iterable of parsed search payloads, each expected to hold
                its merchants under ``['searchResult']['searchMerchants']``.

        Returns:
            ``{key: {'chainName': name, 'latlng': latlng}}`` where key/name are the
            merchant's ``chainID``/``chainName`` when both exist, otherwise the
            merchant's ``address.name`` is used for both.
        """
        d = {}
        for p in post_data:
            if not isinstance(p, dict):
                continue
            # Payloads that are not search results (no merchants list) are skipped.
            merchants = (p.get('searchResult') or {}).get('searchMerchants') or []
            for rst in merchants:
                try:
                    key, name = rst['chainID'], rst['chainName']
                except KeyError:
                    # Merchants without chain info are keyed by their address name.
                    key = name = rst['address']['name']
                d[key] = {'chainName': name, 'latlng': rst['latlng']}
        return d

    def scrape(self) -> dict:
        """Load every result page, then return the restaurant -> lat/lng mapping."""
        self.load_more()
        post_data = self.capture_post_response()
        restaurants_latlng = self.get_restaurant_latlng(post_data)
        return restaurants_latlng

    def save(self, restaurants_latlng, file: str = 'grab_restaurants_latlng.json') -> None:
        """Write the mapping to ``file`` as indented JSON.

        Args:
            restaurants_latlng: JSON-serialisable mapping returned by ``scrape``.
            file: destination path; overwritten if it exists.
        """
        with open(file, 'w', encoding='utf-8') as f:
            json.dump(restaurants_latlng, f, indent=4)


if __name__ == "__main__":
    driver_path = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver"
    base_url = "https://food.grab.com/ph/en/restaurants"

    driver = Driver(driver_path)
    try:
        scraper = Scraper(driver, base_url)
        restaurants_latlng = scraper.scrape()
        scraper.save(restaurants_latlng)
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.tear_down()