Last active
August 24, 2021 07:42
-
-
Save pushpendrapratap/6d8c5517d2374915cd8bb05c158f0f17 to your computer and use it in GitHub Desktop.
Given a base_url ("https://food.grab.com/ph/en/restaurants"), capture the latitude & longitude of every restaurant listed for the user's submitted location (e.g., Manila) by intercepting GrabFood's internal POST request
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| requirements.txt | |
| selenium==3.141.0 | |
| selenium-wire==4.4.1 | |
| """ | |
| import json | |
| from time import sleep | |
| from seleniumwire.webdriver import Firefox | |
| from selenium.webdriver.firefox.options import Options as firefox_options | |
| from selenium.webdriver.common.by import By | |
| from selenium.common.exceptions import TimeoutException | |
| from selenium.webdriver.support.wait import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
class Driver:
    """Own a headless selenium-wire Firefox session started through geckodriver.

    The browser is launched in ``__init__`` (via ``setup``) and exposed as
    ``self.browser``. Call ``tear_down()`` when finished, or use the instance
    as a context manager so the browser is always closed::

        with Driver(path) as driver:
            ...

    Although geckodriver is used here, chromedriver, a remote driver, etc.
    can also be used.
    """

    def __init__(self, driver_path: str = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver") -> None:
        """Store the geckodriver path and start the browser.

        Args:
            driver_path: filesystem path to the geckodriver executable.
        """
        self.driver_path = driver_path
        self.browser = None
        self.setup()

    def setup(self):
        """Build the Firefox options and launch the selenium-wire browser."""
        firefox_opts = firefox_options()
        firefox_opts.headless = True
        user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
                      'Chrome/60.0.3112.50 Safari/537.36')
        # NOTE(review): the four arguments below look like Chromium command-line
        # switches; confirm whether Firefox honours any of them. They are kept
        # unchanged so the launched browser behaves exactly as before.
        firefox_opts.add_argument(f'user-agent={user_agent}')
        firefox_opts.add_argument('--no-sandbox')
        firefox_opts.add_argument("--disable-extensions")
        firefox_opts.add_argument('--disable-dev-shm-usage')
        # selenium-wire options. Requests to the hosts listed here are excluded
        # from capture. Other options previously tried and left disabled:
        # 'enable_har', 'disable_encoding', 'suppress_connection_errors'.
        options = {
            'exclude_hosts': [
                'google-analytics.com',
                'analytics.google.com',
                'google.com',
                'facebook.com',
                'stats.g.doubleclick.net',
            ],
        }
        self.browser = Firefox(
            executable_path=self.driver_path,
            desired_capabilities=firefox_opts.to_capabilities(),
            seleniumwire_options=options,
        )

    def tear_down(self):
        """Quit the browser if one is running; safe to call more than once."""
        if self.browser is None:
            # Nothing was started (or it was already torn down).
            return
        try:
            self.browser.quit()
        finally:
            # Drop the reference even if quit() raised, so a repeated call
            # does not try to quit the same session again.
            self.browser = None

    def __enter__(self):
        """Return this driver for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppress errors."""
        self.tear_down()
class Scraper:
    """Collect restaurant coordinates by intercepting grab-foods' search requests.

    Given a base_url, capture the latitude & longitude of all restaurants
    (based on the user's submitted location, e.g., Manila) by intercepting
    grab-foods' internal POST request.

    self.grab_internal_post_api was found by manually inspecting all XHR made
    by grab-foods, using chrome dev tools.
    """

    def __init__(self, driver: "Driver", base_url: str = "https://food.grab.com/ph/en/restaurants") -> None:
        """Remember the driver and URLs, then open ``base_url`` in the browser.

        Args:
            driver: object exposing the selenium-wire browser as ``driver.browser``.
            base_url: restaurant listing page to load.
        """
        self.driver = driver
        self.base_url = base_url
        self.grab_internal_post_api = "https://portal.grab.com/foodweb/v2/search"
        self._init_request()

    def _init_request(self):
        """Load the listing page and give it a fixed 10 seconds to settle."""
        self.driver.browser.get(self.base_url)
        sleep(10)

    def load_more(self):
        """Click the "load more results" button until it no longer appears.

        Previously captured requests are discarded first, so only the requests
        triggered by these clicks remain in the capture afterwards.
        """
        del self.driver.browser.requests
        condition = EC.presence_of_element_located(
            (By.XPATH, '//button[contains(@class, "ant-btn ant-btn-block")]'))
        page_num = 0
        while True:
            # Re-locate the button before every click: a reference kept from
            # an earlier lookup may no longer be attached to the page.
            try:
                more_results_button = WebDriverWait(
                    self.driver.browser, 10, poll_frequency=1).until(condition)
            except TimeoutException:
                # Also covers the case where the button never shows up at all.
                print("No more LOAD MORE RESULTS button to be clicked!!!\n")
                break
            page_num += 1
            print('page_num: ', page_num)
            more_results_button.click()
            # Fixed pause so the triggered request can complete.
            sleep(10)

    def capture_post_response(self):
        """Return the parsed JSON bodies of all captured search POST responses.

        Returns:
            A list with one decoded JSON object per captured POST request to
            ``self.grab_internal_post_api`` that has a response.
        """
        post_data = []
        for request in self.driver.browser.iter_requests():
            if request.method != 'POST' or request.url != self.grab_internal_post_api:
                continue
            response = request.response
            if response is None:
                # The request was captured but no response was recorded for it.
                continue
            # NOTE(review): assumes the body is plain UTF-8 JSON; if the server
            # compresses responses the body must be decoded first - confirm.
            post_data.append(json.loads(response.body.decode('utf-8')))
        return post_data

    def get_restaurant_latlng(self, post_data):
        """Map each restaurant to its name and coordinates.

        Args:
            post_data: list of decoded search responses; merchants are read
                from ``page['searchResult']['searchMerchants']``.

        Returns:
            Dict keyed by ``chainID`` (or by ``address.name`` when the chain
            fields are missing) with values
            ``{'chainName': ..., 'latlng': ...}``. Later entries with the same
            key overwrite earlier ones.

        Raises:
            KeyError: if a merchant has no ``latlng``, or has neither the
                chain fields nor ``address.name``.
        """
        restaurants = {}
        for page in post_data:
            # A page without results contributes nothing instead of failing.
            merchants = (page.get('searchResult') or {}).get('searchMerchants') or []
            for rst in merchants:
                try:
                    key, name = rst['chainID'], rst['chainName']
                except KeyError:
                    # Merchants without chain info are identified by address name.
                    key = name = rst['address']['name']
                restaurants[key] = {'chainName': name, 'latlng': rst['latlng']}
        return restaurants

    def scrape(self):
        """Load every result page and return the restaurant -> lat/lng mapping."""
        self.load_more()
        post_data = self.capture_post_response()
        return self.get_restaurant_latlng(post_data)

    def save(self, restaurants_latlng, file: str = 'grab_restaurants_latlng.json'):
        """Write the mapping to ``file`` as indented JSON."""
        with open(file, 'w', encoding='utf-8') as f:
            json.dump(restaurants_latlng, f, indent=4)
if __name__ == "__main__":
    # Script entry point: scrape the listing page and save the coordinates.
    driver_path = "/home/Downloads/geckodriver-v0.29.1-linux64/geckodriver"
    base_url = "https://food.grab.com/ph/en/restaurants"
    driver = Driver(driver_path)
    try:
        scraper = Scraper(driver, base_url)
        restaurants_latlng = scraper.scrape()
        scraper.save(restaurants_latlng)
    finally:
        # Always close the browser, even when scraping or saving fails;
        # otherwise the headless Firefox process is left running.
        driver.tear_down()
contents of a single restaurant (i.e., rst):
{'id': '2-C2J3BF3DSA2YRX',
'address': {'name': 'Boodle Inasal x Happy Thirstday - Sampaloc [Available for LONG-DISTANCE DELIVERY]'},
'latlng': {'latitude': 14.6071574, 'longitude': 120.9876869},
'estimatedDeliveryTime': 36,
'merchantBrief': {'description': 'Prices are all VAT inclusive. Prices may also vary or be subject to change by the merchant.',
'cuisine': ['Filipino', 'Chicken', 'Casual Dining'],
'photoHref': 'https://d1sag4ddilekf6.azureedge.net/compressed/merchants/2-C2J3BF3DSA2YRX/hero/2a41ecb1608a4d03b36ab2638d553345_1618388597576346501.jpeg',
'smallPhotoHref': 'https://d1sag4ddilekf6.azureedge.net/compressed/merchants/2-C2J3BF3DSA2YRX/list/0cbd539669bb4632a14dd4ba28a47f5a_1612882355927116650.jpeg',
'isIntegrated': True,
'openHours': {'open': True,
'displayedHours': '00:00-23:59',
'sun': '00:00-23:59',
'mon': '00:00-23:59',
'tue': '00:00-23:59',
'wed': '00:00-23:59',
'thu': '00:00-23:59',
'fri': '00:00-23:59',
'sat': '00:00-23:59'},
'distanceInKm': 1.153,
'rating': 4.3,
'vote_count': 305,
'deliverBy': 'GRAB',
'displayInfo': {'primaryText': 'Boodle Inasal x Happy Thirstday - Sampaloc [Available for LONG-DISTANCE DELIVERY]'},
'deliverOptions': 'DELIVERY_TAKEAWAY'},
'chainID': 'BoodleInasalxHappyThirstday',
'chainName': 'Boodle Inasal x Happy Thirstday',
'metadata': {'origin': ['NonKeyword::', 'Search'],
'discovery': {'requestID': '02555e9e-e34c-4eda-98a5-e85cc21fb13f',
'service': 'foodsearch',
'method': 'nonKeywordSearch',
'discoverID': '21d1370b2b034c00a87a10fc6c0f6405',
'searchFeatures': 'flagMergeQueryResults=0',
'customDimensions': 'business_type=0,is_keyword=false',
'defaultRankGroupID': 'scope_defaultRankingFoodHomepage_exp_20210323_4_base_group_2021-08-20_28',
'defaultRankFormula': '0.2000 * [eta_score] + 0.0620 * [popularity_score] + 0.2460 * [promo_score] + 0.2460 * [partner_score] + 0.0620 * [recency_score] + 0.1850 * [is_integrated]'}},
'businessType': 'FOOD'}
info = self.driver.browser.find_element_by_xpath("//script[@id='__NEXT_DATA__']")
props_text = info.get_attribute('innerHTML')
props = json.loads(props_text)
intital_restaurant_dict = props['props']['initialReduxState']['pageRestaurantsV2']['entities']['restaurantList']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
example (for 'Manila' location):