"""Retrieve all proxy addresses from https://free-proxy-list.net (all pages).

The resulting proxy list can be used for IP rotation to get around bot
protection.
"""
from selenium import webdriver
from bs4 import BeautifulSoup


class Proxy:
    """One proxy entry scraped from the free-proxy-list.net table."""

    def __init__(self, ip, code, type, https):
        self.ip = ip        # "host:port" string
        self.code = code    # country code column
        self.type = type    # anonymity level column
        self.https = https  # HTTPS support column ("yes"/"no")

    def __repr__(self):
        return (f"{type(self).__name__}(ip={self.ip!r}, code={self.code!r}, "
                f"type={self.type!r}, https={self.https!r})")


class ProxyScrapper:
    """Scrapes all proxy pages from free-proxy-list.net via headless Chrome."""

    def init(self):
        """Start a headless Chrome driver with images disabled for speed."""
        chrome_options = webdriver.ChromeOptions()
        prefs = {'profile.managed_default_content_settings.images': 2,
                 'disk-cache-size': 4096}
        chrome_options.add_experimental_option('prefs', prefs)
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(executable_path='drivers/chromedriver',
                                       options=chrome_options)
        self.driver.implicitly_wait(10)

    def tear_down(self):
        """Shut the browser down.

        quit() (not close()) terminates the whole chromedriver process;
        close() only closes the current window and leaks the driver.
        """
        self.driver.quit()

    def _is_next_disabled(self):
        """Return True when the pager's 'Next' button is greyed out (last page)."""
        next_li = self.driver.find_element_by_css_selector('#proxylisttable_next')
        return 'disabled' in next_li.get_attribute("class")

    def _scrape_current_page(self, proxy_list):
        """Parse the currently displayed table page, appending Proxy objects."""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        rows = soup.findAll('tr', {'class': 'odd'}) + soup.findAll('tr', {'class': 'even'})
        for row in rows:
            data = row.findAll('td')
            # Columns: 0=IP, 1=port, 2=country code, 4=anonymity, 6=https.
            # Use .string on every cell so plain text (not bs4 Tag objects)
            # is stored — the original kept Tags for code/type/https.
            proxy_list.append(Proxy(data[0].string + ':' + data[1].string,
                                    data[2].string,
                                    data[4].string,
                                    data[6].string))

    def scrape_proxies(self):
        """Walk every page of the proxy table and return all proxies found.

        Returns:
            list[Proxy]: one entry per table row across all pages.
        """
        self.driver.get('https://free-proxy-list.net')
        proxy_list = []
        # Scrape first, THEN advance: this way the last page (where 'Next'
        # is disabled) is still collected — the original loop skipped it,
        # and collected nothing at all when only a single page existed.
        while True:
            self._scrape_current_page(proxy_list)
            if self._is_next_disabled():
                break
            self.driver.find_element_by_css_selector('#proxylisttable_next>a').click()
        return proxy_list

    def get_proxy_list(self):
        """Full lifecycle: start driver, scrape all pages, always tear down."""
        self.init()
        result = []
        try:
            result = self.scrape_proxies()
        finally:
            # Guarantee the browser process is released even if scraping fails.
            self.tear_down()
        return result


if __name__ == '__main__':
    proxy_scrapper = ProxyScrapper()
    for proxy in proxy_scrapper.get_proxy_list():
        print(proxy.ip)