@tevanraj
Created August 25, 2023 16:35
Pycon-my-2023
import argparse
import os
import time

import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

from tools.helpers import get_text_by_selector
from tools.loaders import load_config
from tools.setups import get_safe_setup

class Spider:
    def __init__(self, driver, config):
        self.__driver = driver
        self.__config = config

    def parse(self, url: str) -> pd.DataFrame:
        """
        Scrapes the page at `url` using the predefined config and returns a DataFrame.

        parameters:
            url: string
        returns:
            pandas DataFrame
        """
        self.__driver.get(url)
        # Wait until the container element is present before collecting the items.
        WebDriverWait(self.__driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, self.__config['container_class']))
        )
        # find_elements_by_class_name was removed in Selenium 4; find_elements is the current API.
        items = self.__driver.find_elements(By.CLASS_NAME, self.__config['items_class'])
        items_content = [
            [get_text_by_selector(div, selector) for selector in self.__config['data_selectors']]
            for div in items
        ]
        return pd.DataFrame(items_content, columns=self.__config['data_column_titles'])

    def parse_pages(self, url: str):
        """
        Scrapes a paginated website from `url` using the predefined config, yielding one DataFrame per page.

        parameters:
            url: string
        """
        pagination_config = self.__config['pagination']
        for i in tqdm(range(1, pagination_config['crawl_pages'] + 1)):
            # "$p$" in the url is the page-number placeholder.
            yield self.parse(url.replace("$p$", str(i)))
            time.sleep(int(pagination_config['delay'] / 1000))

def scrape(args):
    config = load_config(args.config)
    pagination_config = config['pagination']
    url = config['url']

    driver = get_safe_setup()
    spider = Spider(driver, config)

    # dirname() is empty when the output path has no directory part, so fall back to ".".
    os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)

    try:
        if pagination_config['crawl_pages'] > 0:
            data = spider.parse_pages(url)
            df = pd.concat(list(data), axis=0)
        else:
            df = spider.parse(url)
        df.to_csv(args.output, index=False)
    except Exception as e:
        print(f'Parsing failed due to {str(e)}')
    finally:
        driver.quit()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='Path to the spider configuration file')
    parser.add_argument('-o', '--output', help='Output CSV file path')
    args = parser.parse_args()
    scrape(args)
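
Everything the Spider does is driven by the dict that load_config returns. The gist does not include a config file, but the required keys can be read straight off the code above. A minimal sketch, assuming a hypothetical paginated listing site, might look like this once loaded; all values and selectors are illustrative, and whether data_selectors are CSS selectors depends on get_text_by_selector, which is not shown.

# Hypothetical config, shown as the dict the code above expects from load_config.
example_config = {
    'url': 'https://example.com/listings?page=$p$',  # "$p$" is swapped for the page number
    'container_class': 'results',                    # class the WebDriverWait looks for
    'items_class': 'result-card',                    # class shared by every item element
    'data_selectors': ['h2.title', 'span.price', 'div.location'],  # one selector per column
    'data_column_titles': ['title', 'price', 'location'],          # DataFrame column names
    'pagination': {
        'crawl_pages': 3,   # 0 disables pagination and scrapes url as a single page
        'delay': 2000,      # pause between pages, in milliseconds
    },
}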
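
The tools package (get_text_by_selector, get_safe_setup, load_config) is also not part of the gist. The sketch below is only a guess at its contents, inferred from how the three helpers are called; the YAML format, the headless-Chrome setup, and the CSS-selector lookup are assumptions, not the author's actual implementation.

# Hedged sketch of the missing tools package (tools/loaders.py, tools/setups.py, tools/helpers.py).
import yaml
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def load_config(path: str) -> dict:
    # Assumption: the config file is YAML that parses into the dict shape sketched above.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def get_safe_setup():
    # Assumption: "safe" means a headless Chrome driver with a fixed window size.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    options.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(options=options)


def get_text_by_selector(element, selector: str) -> str:
    # Assumption: returns the text of the first descendant matching a CSS selector,
    # or an empty string when nothing matches.
    try:
        return element.find_element(By.CSS_SELECTOR, selector).text.strip()
    except NoSuchElementException:
        return ''

With a config saved at, say, configs/listings.yaml and the main script saved as spider.py (hypothetical paths), the scraper would be run as: python spider.py -c configs/listings.yaml -o output/listings.csv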