Pycon-my-2023
import argparse
import os
import time

import pandas as pd
from tqdm import tqdm
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tools.helpers import get_text_by_selector
from tools.loaders import load_config
from tools.setups import get_safe_setup


class Spider:
    def __init__(self, driver, config):
        self.__driver = driver
        self.__config = config

    def parse(self, url: str) -> pd.DataFrame:
        """
        Scrapes a website at `url` using the predefined config.
        parameters:
            url: string
        returns:
            pandas DataFrame
        """
        self.__driver.get(url)
        # Wait up to 5 seconds for the results container to render
        # before reading any items from the page.
        WebDriverWait(self.__driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, self.__config['container_class']))
        )
        # find_elements_by_class_name was removed in Selenium 4;
        # use find_elements(By.CLASS_NAME, ...) instead.
        items = self.__driver.find_elements(By.CLASS_NAME, self.__config['items_class'])
        items_content = [
            [get_text_by_selector(div, selector) for selector in self.__config['data_selectors']]
            for div in items
        ]
        return pd.DataFrame(items_content, columns=self.__config['data_column_titles'])

    def parse_pages(self, url: str):
        """
        Scrapes a paginated website from `url` using the predefined config,
        yielding one pandas DataFrame per page.
        parameters:
            url: string containing a `$p$` placeholder for the page number
        """
        pagination_config = self.__config['pagination']
        for i in tqdm(range(1, pagination_config['crawl_pages'] + 1)):
            yield self.parse(url.replace("$p$", str(i)))
            # The configured delay is in milliseconds.
            time.sleep(pagination_config['delay'] / 1000)


def scrape(args):
    config = load_config(args.config)
    pagination_config = config['pagination']
    url = config['url']

    driver = get_safe_setup()
    spider = Spider(driver, config)

    # Fall back to the current directory when the output path has no directory part.
    os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
    try:
        if pagination_config['crawl_pages'] > 0:
            data = spider.parse_pages(url)
            df = pd.concat(list(data), axis=0)
        else:
            df = spider.parse(url)
        df.to_csv(args.output, index=False)
    except Exception as e:
        print(f'Parsing failed due to {e}')
    finally:
        driver.quit()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True, help='Path to the spider configuration file')
    parser.add_argument('-o', '--output', required=True, help='Output CSV file path')
    args = parser.parse_args()
    scrape(args)
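
The tools.loaders.load_config implementation is not part of the gist, so the config format is an assumption. Assuming it reads JSON, a config matching the keys the script looks up (url, container_class, items_class, data_selectors, data_column_titles, pagination) might look like the sample below. Every class name and selector here is a hypothetical placeholder, and $p$ is the page-number token that parse_pages substitutes:

{
    "url": "https://example.com/catalog?page=$p$",
    "container_class": "product-grid",
    "items_class": "product-card",
    "data_selectors": [".title", ".price"],
    "data_column_titles": ["title", "price"],
    "pagination": {
        "crawl_pages": 5,
        "delay": 2000
    }
}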
Sign up for free
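
With that config saved as config.json and the gist saved as spider.py (both filenames are assumptions, not part of the gist), a run could look like:

python spider.py --config config.json --output output/results.csv

The scraped rows land as CSV at the --output path; the output directory is created if it does not already exist.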
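
The tools package (get_text_by_selector, get_safe_setup, load_config) is also not included in the gist. Below is a minimal sketch of implementations consistent with how the script calls them; all of it is an assumption, not the author's actual code, and data_selectors are assumed to be CSS selectors.

# tools/ sketch -- hypothetical implementations, not from the gist.
import json

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def load_config(path: str) -> dict:
    """Assumed: reads the spider configuration from a JSON file."""
    with open(path) as f:
        return json.load(f)


def get_safe_setup() -> webdriver.Chrome:
    """Assumed: returns a headless Chrome driver."""
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=options)


def get_text_by_selector(element, selector: str) -> str:
    """Assumed: returns the text of the first descendant matching a CSS
    selector, or an empty string if none is found."""
    try:
        return element.find_element(By.CSS_SELECTOR, selector).text.strip()
    except Exception:
        return ""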