Given a base_url ("https://food.grab.com/ph/en/restaurants"), capture the latitude & longitude of all restaurants (based on the user's submitted location, e.g., Manila) by intercepting grab-food's internal POST request.
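Condensed to its core, the interception looks roughly like the minimal sketch below; it is an illustration, not a drop-in replacement for the full script that follows. The endpoint URL and the response fields come from the script, wait_for_request is an alternative to iterating driver.requests as done below, 'disable_encoding': True is assumed so the captured body is plain JSON, and it assumes the search POST fires for the default location on the initial page load (the full script instead captures requests while clicking "Load more").

# Minimal sketch: capture one grab-food search response with selenium-wire.
# Assumes geckodriver is on PATH and 'disable_encoding' keeps bodies uncompressed.
import json
from seleniumwire import webdriver
from selenium.webdriver.firefox.options import Options

opts = Options()
opts.headless = True
driver = webdriver.Firefox(options=opts, seleniumwire_options={'disable_encoding': True})

driver.get("https://food.grab.com/ph/en/restaurants")
# Block until the internal search POST has been captured (or raise after 30 s).
req = driver.wait_for_request("https://portal.grab.com/foodweb/v2/search", timeout=30)
data = json.loads(req.response.body.decode('utf-8'))
print(len(data['searchResult']['searchMerchants']), "merchants in the first page of results")

driver.quit()

The full script follows.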
"""
requirements.txt
selenium==3.141.0
selenium-wire==4.4.1
"""
import json
from time import sleep
from seleniumwire.webdriver import Firefox
from selenium.webdriver.firefox.options import Options as firefox_options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class Driver:
    """
    Driver setup.
    Although geckodriver is used here, chromedriver, remotedriver, etc. can also be used.
    """

    def __init__(self, driver_path: str = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver") -> None:
        self.driver_path = driver_path
        self.browser = None
        self.setup()

    def setup(self):
        firefox_opts = firefox_options()
        firefox_opts.headless = True
        user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
                      'Chrome/60.0.3112.50 Safari/537.36')
        # Firefox takes the user agent as a profile preference, not a Chrome-style 'user-agent=' argument.
        firefox_opts.set_preference('general.useragent.override', user_agent)
        firefox_opts.add_argument('--no-sandbox')
        firefox_opts.add_argument("--disable-extensions")
        firefox_opts.add_argument('--disable-dev-shm-usage')
        # selenium-wire specific options; 'disable_encoding' can be enabled if captured
        # response bodies arrive compressed (gzip/br) instead of plain JSON.
        options = {
            # 'enable_har': True,
            # 'disable_encoding': True,
            'exclude_hosts': [
                'google-analytics.com',
                'analytics.google.com',
                'google.com',
                'facebook.com',
                'stats.g.doubleclick.net',
            ],
            # 'suppress_connection_errors': True
        }
        self.browser = Firefox(
            executable_path=self.driver_path,
            desired_capabilities=firefox_opts.to_capabilities(),
            seleniumwire_options=options,
        )

    def tear_down(self):
        self.browser.quit()


class Scraper:
    """
    Given a base_url, capture the latitude & longitude of all restaurants (based on the user's
    submitted location, e.g., Manila) by intercepting grab-food's internal POST request.
    self.grab_internal_post_api was found by manually inspecting the XHR requests made by grab-food,
    using the Chrome dev tools.
    """

    def __init__(self, driver: Driver, base_url: str = "https://food.grab.com/ph/en/restaurants") -> None:
        self.driver = driver
        self.base_url = base_url
        self.grab_internal_post_api = "https://portal.grab.com/foodweb/v2/search"
        self._init_request()

    def _init_request(self):
        self.driver.browser.get(self.base_url)
        sleep(10)
    def load_more(self):
        # Clear any requests captured so far so that only the paginated search calls are kept.
        del self.driver.browser.requests
        condition = EC.presence_of_element_located((By.XPATH, '//button[contains(@class, "ant-btn ant-btn-block")]'))
        more_results_button = WebDriverWait(self.driver.browser, 10, poll_frequency=1).until(condition)
        print('more_results_button: ', more_results_button, '\n')
        more_results_button.click()
        sleep(10)
        page_num = 1
        while more_results_button:
            try:
                print('page_num: ', page_num)
                more_results_button.click()
                more_results_button = WebDriverWait(self.driver.browser, 10, poll_frequency=1).until(condition)
                page_num += 1
                sleep(10)
            except TimeoutException:
                print("No more LOAD MORE RESULTS button to be clicked!\n")
                break
    def capture_post_response(self):
        post_data = []
        for r in self.driver.browser.iter_requests():
            if r.method == 'POST' and r.url == self.grab_internal_post_api:
                # print(f"r.response.status_code: {r.response.status_code}, r.response.reason: {r.response.reason}")
                body = r.response.body.decode('utf-8')
                data = json.loads(body)
                post_data.append(data)
        return post_data

    def get_restaurant_latlng(self, post_data):
        d = {}
        for p in post_data:
            merchants = p['searchResult']['searchMerchants']
            for rst in merchants:
                try:
                    d[rst['chainID']] = {'chainName': rst['chainName'], 'latlng': rst['latlng']}
                except KeyError as err:
                    # Some merchants carry no chainID/chainName; fall back to the address name.
                    d[rst['address']['name']] = {'chainName': rst['address']['name'], 'latlng': rst['latlng']}
                    # print(rst)
                    # print(type(err), err)
        return d
    def scrape(self):
        self.load_more()
        post_data = self.capture_post_response()
        restaurants_latlng = self.get_restaurant_latlng(post_data)
        return restaurants_latlng

    def save(self, restaurants_latlng, file: str = 'grab_restaurants_latlng.json'):
        with open(file, 'w') as f:
            json.dump(restaurants_latlng, f, indent=4)


if __name__ == "__main__":
    driver_path = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver"
    base_url = "https://food.grab.com/ph/en/restaurants"
    driver = Driver(driver_path)
    scraper = Scraper(driver, base_url)
    restaurants_latlng = scraper.scrape()
    scraper.save(restaurants_latlng)
    driver.tear_down()
@pushpendrapratap (Author) commented: the restaurant list can also be read directly from the page's embedded __NEXT_DATA__ payload:
info = self.driver.browser.find_element_by_xpath("//script[@id='__NEXT_DATA__']")
props_text = info.get_attribute('innerHTML')
props = json.loads(props_text)
initial_restaurant_dict = props['props']['initialReduxState']['pageRestaurantsV2']['entities']['restaurantList']
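For context, here is a minimal standalone sketch of this __NEXT_DATA__ approach. It reuses the Driver class from the script above; the output filename is illustrative, and the exact fields available per restaurant inside restaurantList (for example whether latlng is always present) are an assumption to confirm by inspecting the dump.

# Hedged sketch: dump the restaurantList embedded in __NEXT_DATA__ for offline inspection.
# Reuses the Driver class from the script above; adjust its driver_path for your machine.
import json
from time import sleep

driver = Driver()
driver.browser.get("https://food.grab.com/ph/en/restaurants")
sleep(10)  # give the page time to render the __NEXT_DATA__ script tag

info = driver.browser.find_element_by_xpath("//script[@id='__NEXT_DATA__']")
props = json.loads(info.get_attribute('innerHTML'))
restaurant_list = props['props']['initialReduxState']['pageRestaurantsV2']['entities']['restaurantList']

# Write the raw structure out so per-restaurant fields (e.g. latlng) can be checked.
with open('next_data_restaurant_list.json', 'w') as f:
    json.dump(restaurant_list, f, indent=4)

driver.tear_down()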
