Skip to content

Instantly share code, notes, and snippets.

@aleenprd
Last active October 24, 2022 18:11
Show Gist options
  • Select an option

  • Save aleenprd/63b736f9f6183e55a237f52afd86e12f to your computer and use it in GitHub Desktop.

Select an option

Save aleenprd/63b736f9f6183e55a237f52afd86e12f to your computer and use it in GitHub Desktop.

Revisions

  1. aleenprd renamed this gist Oct 24, 2022. 1 changed file with 3 additions and 23 deletions.
    26 changes: 3 additions & 23 deletions scraper_abstract_class.py → scraper_base_class.py
    Original file line number Diff line number Diff line change
    @@ -8,8 +8,8 @@ class ImdbScraperException(ScraperException):
    pass


    class Scraper(ABC):
    """Abstract class meant to be parent of various other scrapers.
    class Scraper():
    """Class meant to be parent of various other scrapers.
    Attributes:
    chromedriver (chromedriver): a Chrome webdriver for Selenium.
    @@ -38,24 +38,4 @@ def fetch_el_if_available(soup: BeautifulSoup, element_type: str, class_type: st
    if element is not None:
    element = element.text

    return element

    def make_soup_with_selenium(self, url: str) -> BeautifulSoup:
    """Return an HTML body from an URL.
    Args:
    url (str): string representation of a URL address.
    Returns:
    soup (BeautifulSoup): scraped webpage via bs4.
    """
    # You can either use a driver (write your one coad) like this
    # or install one (Chromedriver, Safari, etc.)
    self.chromedriver.maximize_window()
    self.chromedriver.get(url)
    sleep(5) # We want to give the page some time to load up

    page_source = self.chromedriver.page_source
    soup = BeautifulSoup(page_source, 'lxml')

    return soup
    return element
  2. aleenprd created this gist Oct 24, 2022.
    61 changes: 61 additions & 0 deletions scraper_abstract_class.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    class ScraperException(Exception):
        """Root of the exception hierarchy raised by scrapers."""


    class ImdbScraperException(ScraperException):
        """Root of the exceptions specific to the IMDb scraper."""


    class Scraper(ABC):
        """Base class meant to be the parent of various other scrapers.

        Each instance starts its own Chrome webdriver on construction. Call
        ``close()`` when finished, or use the instance as a context manager
        (``with Scraper() as scraper: ...``), so the driver is shut down.

        Attributes:
            chromedriver (chromedriver): a Chrome webdriver for Selenium.

        Methods:
            make_soup_with_selenium
            close
            @staticmethod fetch_el_if_available
        """

        def __init__(self):
            # ChromeDriverManager().install() supplies the driver path that the
            # Selenium Service is built from.
            driver_service = Service(ChromeDriverManager().install())
            self.chromedriver = webdriver.Chrome(service=driver_service)

        def __enter__(self):
            """Return the scraper itself so it can be used in a ``with`` block."""
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            """Shut the webdriver down on leaving a ``with`` block.

            Returns None, so an exception raised inside the block propagates.
            """
            self.close()

        def close(self):
            """Quit the Chrome webdriver held by this scraper."""
            self.chromedriver.quit()

        @staticmethod
        def fetch_el_if_available(soup: BeautifulSoup, element_type: str, class_type: str):
            """Return the text of the first matching element, or None if absent.

            Args:
                soup (BeautifulSoup): a bs4 soup.
                element_type (str): HTML type e.g. 'div'.
                class_type (str): the class of the desired element.

            Returns:
                element (str | None): text inside the element, or None when
                    ``soup.find`` does not find a match.
            """
            element = soup.find(element_type, class_type)
            if element is None:
                return None

            return element.text

        def make_soup_with_selenium(self, url: str, wait_seconds: float = 5) -> BeautifulSoup:
            """Load a URL in the webdriver and return the page parsed with bs4.

            Args:
                url (str): string representation of a URL address.
                wait_seconds (float): how long to sleep after requesting the
                    page before reading its source. Defaults to 5.

            Returns:
                soup (BeautifulSoup): scraped webpage via bs4.
            """
            self.chromedriver.maximize_window()
            self.chromedriver.get(url)
            # Fixed pause to give the page some time to load up; nothing here
            # checks that loading has actually finished.
            sleep(wait_seconds)

            page_source = self.chromedriver.page_source
            return BeautifulSoup(page_source, 'lxml')