import argparse
import os
import time

import pandas as pd
from tqdm import tqdm

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tools.helpers import get_text_by_selector
from tools.setups import get_safe_setup
from tools.loaders import load_config

class Spider:
    def __init__(self, driver, config):
        self.__driver = driver
        self.__config = config

    def parse(self, url: str) -> pd.DataFrame:
        """
        Scrape a single page at `url` using the predefined config.

        parameters:
            url: string

        returns:
            pandas DataFrame with one row per scraped item
        """
        self.__driver.get(url)

        # Wait until the items container is present before querying the DOM.
        WebDriverWait(self.__driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, self.__config['container_class']))
        )

        # find_elements_by_class_name was removed in Selenium 4; use find_elements instead.
        items = self.__driver.find_elements(By.CLASS_NAME, self.__config['items_class'])
        items_content = [
            [get_text_by_selector(div, selector) for selector in self.__config['data_selectors']]
            for div in items
        ]
        return pd.DataFrame(items_content, columns=self.__config['data_column_titles'])
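
    # Note: parse() only reads the config keys 'container_class', 'items_class',
    # 'data_selectors' and 'data_column_titles'; the selector format expected by
    # get_text_by_selector is defined in tools.helpers.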

    def parse_pages(self, url: str):
        """
        Scrape a paginated website using the predefined config, yielding one
        pandas DataFrame per page.

        parameters:
            url: string containing a "$p$" placeholder for the page number
        """
        pagination_config = self.__config['pagination']

        for i in tqdm(range(1, pagination_config['crawl_pages'] + 1)):
            yield self.parse(url.replace("$p$", str(i)))

            # Throttle requests; the configured delay is in milliseconds.
            time.sleep(int(pagination_config['delay'] / 1000))
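
    # Example (hypothetical URL): with crawl_pages = 3 and
    # url = "https://example.com/catalog?page=$p$", parse_pages() requests
    # pages 1, 2 and 3 by substituting each page number for the "$p$" token.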

def scrape(args):
    config = load_config(args.config)

    pagination_config = config['pagination']
    url = config['url']

    driver = get_safe_setup()

    spider = Spider(driver, config)

    # Create the output directory if needed; fall back to the current
    # directory when the output path has no directory component.
    os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)

    try:
        if pagination_config['crawl_pages'] > 0:
            data = spider.parse_pages(url)
            df = pd.concat(list(data), axis=0)
        else:
            df = spider.parse(url)

        df.to_csv(args.output, index=False)
    except Exception as e:
        print(f'Parsing failed due to {e}')
    finally:
        driver.quit()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='Path to the spider configuration file')
    parser.add_argument('-o', '--output', help='Output CSV file path')
    args = parser.parse_args()

    scrape(args)
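
# Example invocation (script and file names are hypothetical):
#
#   python spider.py -c configs/books.yaml -o output/books.csv
#
# A minimal config sketch, assuming load_config in tools.loaders reads YAML;
# the exact format and selector syntax are defined by the tools package:
#
#   url: "https://example.com/catalog?page=$p$"
#   container_class: "results"
#   items_class: "result-card"
#   data_selectors: [".title", ".price"]
#   data_column_titles: ["title", "price"]
#   pagination:
#     crawl_pages: 3
#     delay: 1000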