Skip to content

Instantly share code, notes, and snippets.

@pushpendrapratap
Last active August 24, 2021 07:42
Show Gist options
  • Select an option

  • Save pushpendrapratap/6d8c5517d2374915cd8bb05c158f0f17 to your computer and use it in GitHub Desktop.

Select an option

Save pushpendrapratap/6d8c5517d2374915cd8bb05c158f0f17 to your computer and use it in GitHub Desktop.
Given a base_url ("https://food.grab.com/ph/en/restaurants"), capture the latitude & longitude of all restaurants (for the user's submitted location, e.g., Manila) by intercepting GrabFood's internal POST requests.
"""
requirements.txt
selenium==3.141.0
selenium-wire==4.4.1
"""
import json
from time import sleep
from seleniumwire.webdriver import Firefox
from selenium.webdriver.firefox.options import Options as firefox_options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class Driver:
    """
    Driver setup: owns one selenium-wire instrumented Firefox session.

    Although geckodriver is used here, chromedriver, a remote driver, etc.
    can also be used.

    Attributes:
        driver_path: filesystem path of the geckodriver executable.
        headless: whether Firefox is started without a visible window.
        user_agent: user-agent string to advertise; a falsy value keeps
            the browser's own user agent.
        browser: the live selenium-wire ``Firefox`` instance, or ``None``
            before ``setup()`` succeeds / after ``tear_down()``.
    """

    # Default user agent advertised to the site (same string as before).
    DEFAULT_USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/60.0.3112.50 Safari/537.36')

    def __init__(self,
                 driver_path: str = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver",
                 headless: bool = True,
                 user_agent: str = DEFAULT_USER_AGENT) -> None:
        """Store the configuration and start the browser immediately."""
        self.driver_path = driver_path
        self.headless = headless
        self.user_agent = user_agent
        self.browser = None
        self.setup()

    def setup(self):
        """Create the Firefox session and store it in ``self.browser``."""
        firefox_opts = firefox_options()
        firefox_opts.headless = self.headless
        # Firefox takes its user-agent override from a preference. Passing
        # 'user-agent=...' (or Chromium switches such as --no-sandbox) as
        # command-line arguments has no such effect on Firefox, so those
        # arguments are no longer added.
        if self.user_agent:
            firefox_opts.set_preference('general.useragent.override', self.user_agent)

        # selenium-wire options: do not capture traffic to analytics/ads
        # hosts. ('enable_har', 'disable_encoding' and
        # 'suppress_connection_errors' are further options that were
        # previously left disabled here.)
        options = {
            'exclude_hosts': [
                'google-analytics.com',
                'analytics.google.com',
                'google.com',
                'facebook.com',
                'stats.g.doubleclick.net',
            ],
        }
        self.browser = Firefox(
            executable_path=self.driver_path,
            desired_capabilities=firefox_opts.to_capabilities(),
            seleniumwire_options=options,
        )

    def tear_down(self):
        """Quit the browser if one is running; safe to call repeatedly."""
        if self.browser is None:
            # setup() never completed, or tear_down() already ran.
            return
        try:
            self.browser.quit()
        finally:
            # Forget the session even if quit() raised, so a second call
            # does not try to quit a dead browser.
            self.browser = None
class Scraper:
    """
    Given a base_url, capture all restaurants (based on user's submitted
    location, e.g., Manila) latitude & longitude by intercepting grab-foods
    internal POST request.

    self.grab_internal_post_api was found by manually inspecting all XHR
    made by grab-foods, using chrome dev tools.
    """

    def __init__(self, driver: "Driver", base_url: str = "https://food.grab.com/ph/en/restaurants",
                 pause: float = 10) -> None:
        """
        Args:
            driver: object exposing the selenium-wire browser as ``.browser``.
            base_url: restaurants listing page to open.
            pause: seconds to sleep after the initial page load and after
                each "load more" click, giving the page time to settle.
        """
        self.driver = driver
        self.base_url = base_url
        self.pause = pause
        self.grab_internal_post_api = "https://portal.grab.com/foodweb/v2/search"
        self._init_request()

    def _init_request(self):
        """Open the listing page and wait for it to finish loading."""
        self.driver.browser.get(self.base_url)
        sleep(self.pause)

    def load_more(self, max_pages=None):
        """
        Click the "load more results" button until it no longer shows up.

        Args:
            max_pages: optional upper bound on the number of clicks;
                ``None`` (default) keeps clicking until the button is gone.
        """
        browser = self.driver.browser
        # Forget everything captured so far, so only the requests triggered
        # by the clicks below are inspected later.
        del browser.requests

        locator = (By.XPATH, '//button[contains(@class, "ant-btn ant-btn-block")]')
        page_num = 0
        while max_pages is None or page_num < max_pages:
            try:
                # Re-locate the button before every click: the element found
                # on a previous iteration may have been re-rendered.
                button = WebDriverWait(browser, 10, poll_frequency=1).until(
                    EC.element_to_be_clickable(locator))
            except TimeoutException:
                print("No more LOAD MORE RESULTS button to be clicked!!!\n")
                break
            page_num += 1
            print('page_num: ', page_num)
            button.click()
            sleep(self.pause)

    def capture_post_response(self):
        """
        Collect the decoded JSON bodies of all captured search POST requests.

        Returns:
            list of parsed JSON payloads, one per matching request that has
            a decodable response.
        """
        post_data = []
        for request in self.driver.browser.iter_requests():
            if request.method != 'POST' or request.url != self.grab_internal_post_api:
                continue
            response = request.response
            if response is None:
                # The request never received a response; nothing to parse.
                continue
            try:
                post_data.append(json.loads(response.body.decode('utf-8')))
            except ValueError as err:
                # Covers both UnicodeDecodeError and json.JSONDecodeError:
                # one bad body should not discard the other pages.
                print(f"Skipping undecodable response from {request.url}: {err}")
        return post_data

    def get_restaurant_latlng(self, post_data):
        """
        Reduce search payloads to ``{key: {'chainName': ..., 'latlng': ...}}``.

        The key is the merchant's ``chainID`` (with its ``chainName``) when
        both are present, otherwise the merchant's ``address.name`` is used
        for both. Merchants without ``latlng`` or without any usable name
        are skipped, as are payloads lacking
        ``searchResult.searchMerchants``.

        NOTE(review): entries sharing a key overwrite earlier ones, so only
        one location is kept per chainID — confirm this is intended.
        """
        restaurants = {}
        for payload in post_data:
            merchants = (payload.get('searchResult') or {}).get('searchMerchants') or []
            for merchant in merchants:
                latlng = merchant.get('latlng')
                if latlng is None:
                    continue
                if 'chainID' in merchant and 'chainName' in merchant:
                    key, name = merchant['chainID'], merchant['chainName']
                else:
                    key = name = (merchant.get('address') or {}).get('name')
                    if key is None:
                        continue
                restaurants[key] = {'chainName': name, 'latlng': latlng}
        return restaurants

    def scrape(self):
        """Load every result page, then return the restaurant lat/lng map."""
        self.load_more()
        post_data = self.capture_post_response()
        return self.get_restaurant_latlng(post_data)

    def save(self, restaurants_latlng, file: str = 'grab_restaurants_latlng.json'):
        """Write the lat/lng map to ``file`` as indented JSON."""
        with open(file, 'w', encoding='utf-8') as f:
            json.dump(restaurants_latlng, f, indent=4)
if __name__ == "__main__":
    # Script entry point: scrape the default listing page and save the
    # restaurant coordinates to the default JSON file.
    driver_path = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver"
    base_url = "https://food.grab.com/ph/en/restaurants"
    driver = Driver(driver_path)
    try:
        scraper = Scraper(driver, base_url)
        restaurants_latlng = scraper.scrape()
        scraper.save(restaurants_latlng)
    finally:
        # Always shut the browser down, even when scraping fails, so no
        # Firefox/geckodriver process is left running.
        driver.tear_down()
@pushpendrapratap
Copy link
Author

contents of a single restaurant (i.e., rst):

{'id': '2-C2J3BF3DSA2YRX',
 'address': {'name': 'Boodle Inasal x Happy Thirstday - Sampaloc [Available for LONG-DISTANCE DELIVERY]'},
 'latlng': {'latitude': 14.6071574, 'longitude': 120.9876869},
 'estimatedDeliveryTime': 36,
 'merchantBrief': {'description': 'Prices are all VAT inclusive. Prices may also vary or be subject to change by the merchant.',
  'cuisine': ['Filipino', 'Chicken', 'Casual Dining'],
  'photoHref': 'https://d1sag4ddilekf6.azureedge.net/compressed/merchants/2-C2J3BF3DSA2YRX/hero/2a41ecb1608a4d03b36ab2638d553345_1618388597576346501.jpeg',
  'smallPhotoHref': 'https://d1sag4ddilekf6.azureedge.net/compressed/merchants/2-C2J3BF3DSA2YRX/list/0cbd539669bb4632a14dd4ba28a47f5a_1612882355927116650.jpeg',
  'isIntegrated': True,
  'openHours': {'open': True,
   'displayedHours': '00:00-23:59',
   'sun': '00:00-23:59',
   'mon': '00:00-23:59',
   'tue': '00:00-23:59',
   'wed': '00:00-23:59',
   'thu': '00:00-23:59',
   'fri': '00:00-23:59',
   'sat': '00:00-23:59'},
  'distanceInKm': 1.153,
  'rating': 4.3,
  'vote_count': 305,
  'deliverBy': 'GRAB',
  'displayInfo': {'primaryText': 'Boodle Inasal x Happy Thirstday - Sampaloc [Available for LONG-DISTANCE DELIVERY]'},
  'deliverOptions': 'DELIVERY_TAKEAWAY'},
 'chainID': 'BoodleInasalxHappyThirstday',
 'chainName': 'Boodle Inasal x Happy Thirstday',
 'metadata': {'origin': ['NonKeyword::', 'Search'],
  'discovery': {'requestID': '02555e9e-e34c-4eda-98a5-e85cc21fb13f',
   'service': 'foodsearch',
   'method': 'nonKeywordSearch',
   'discoverID': '21d1370b2b034c00a87a10fc6c0f6405',
   'searchFeatures': 'flagMergeQueryResults=0',
   'customDimensions': 'business_type=0,is_keyword=false',
   'defaultRankGroupID': 'scope_defaultRankingFoodHomepage_exp_20210323_4_base_group_2021-08-20_28',
   'defaultRankFormula': '0.2000 * [eta_score] + 0.0620 * [popularity_score] + 0.2460 * [promo_score] + 0.2460 * [partner_score] + 0.0620 * [recency_score] + 0.1850 * [is_integrated]'}},
 'businessType': 'FOOD'}

@pushpendrapratap
Copy link
Author

# Fragment meant to run inside a Scraper method (it relies on self.driver).
# It reads the JSON text embedded in the page's <script id="__NEXT_DATA__">
# element and parses it.
info = self.driver.browser.find_element_by_xpath("//script[@id='__NEXT_DATA__']")
props_text = info.get_attribute('innerHTML')
props = json.loads(props_text)
# NOTE(review): presumably the restaurants delivered with the initial page
# load rather than through the intercepted POST requests — confirm.
intital_restaurant_dict = props['props']['initialReduxState']['pageRestaurantsV2']['entities']['restaurantList']

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment