Skip to content

Instantly share code, notes, and snippets.

@tvinhkhoa
Forked from wooddar/multiprocess_selenium.py
Created January 29, 2023 17:57
Show Gist options
  • Save tvinhkhoa/3cff47dbc81b51c5ab0deaf6f1922456 to your computer and use it in GitHub Desktop.
Save tvinhkhoa/3cff47dbc81b51c5ab0deaf6f1922456 to your computer and use it in GitHub Desktop.

Revisions

  1. @wooddar wooddar revised this gist Nov 20, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion multiprocess_selenium.py
    Original file line number Diff line number Diff line change
    @@ -36,7 +36,7 @@ def selenium_task(worker, data):
    This is a demonstration selenium function that takes a worker and data and then does something with the worker and
    data.
    TODO: change the below code to be whatever it is you want your worker to do
    TODO: change the below code to be whatever it is you want your worker to do e.g. scrape webpages or run browser tests
    :param worker: A selenium web worker NOT a worker ID
    :type worker: webdriver.XXX
  2. @wooddar wooddar revised this gist Nov 19, 2018. No changes.
  3. @wooddar wooddar created this gist Nov 19, 2018.
    106 changes: 106 additions & 0 deletions multiprocess_selenium.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,106 @@
    """
    This is an adaptable example script for using selenium across multiple web browsers simultaneously. It makes use of
    two queues - one to store the IDs of idle web workers and another to store the data to pass to any idle web worker in a selenium function
    """
    from multiprocessing import Queue, cpu_count
    from threading import Thread
    from selenium import webdriver
    from time import sleep
    from numpy.random import randint
    import logging


    logger = logging.getLogger(__name__)

    # Example payload for the selenium workers: each number is used as a sleep duration by the demo task and
    # the trailing 'STOP' entry tells the queue listeners to shut down.
    # Any list of picklable items can be used here.
    selenium_data = [4, 2, 3, 3, 4, 3, 4, 3, 1, 2, 3, 2, 'STOP']

    # One queue carries the data items, the other carries the IDs of idle selenium workers
    selenium_data_queue, worker_queue = Queue(), Queue()

    # Selenium workers cannot be pickled, so each one is registered under an integer ID and only the ID
    # ever travels through the worker queue.
    # By default one selenium worker is created per cpu core (cpu_count).
    # TODO: Change the worker creation code to be your webworker of choice e.g. PhantomJS
    worker_ids = [*range(cpu_count())]
    selenium_workers = {}
    for worker_id in worker_ids:
        selenium_workers[worker_id] = webdriver.Chrome()
    # Every worker starts out idle
    for worker_id in worker_ids:
        worker_queue.put(worker_id)


    def selenium_task(worker, data, url='https://ytroulette.com'):
        """
        Demonstration selenium function: resize the browser window, load a page and then sleep.

        TODO: change the below code to be whatever it is you want your worker to do e.g. scrape webpages or run browser tests

        :param worker: A selenium web worker NOT a worker ID
        :type worker: webdriver.XXX
        :param data: Any data for your selenium function (must be pickleable); in this demo it is passed
            straight to ``sleep`` so it has to be a number of seconds
        :param url: Address of the page to load, defaults to the page this example previously hard-coded
        :type url: str
        :rtype: None
        """
        # Give the window a random size: width drawn from [100, 200), height from [200, 400)
        worker.set_window_size(randint(100, 200), randint(200, 400))
        # Log the address that is really requested (the old message claimed "Google" whatever the URL was)
        logger.info("Getting %s", url)
        worker.get(url)
        logger.info("Sleeping")
        sleep(data)


    def selenium_queue_listener(data_queue, worker_queue):
        """
        Monitor a data queue and assign new pieces of data to any available web workers to action.

        The loop runs until the 'STOP' item is read from ``data_queue``. A failure inside ``selenium_task`` is
        logged and the listener carries on with the next item; the worker is handed back to ``worker_queue``
        whether the task succeeded or not.

        :param data_queue: The python FIFO queue containing the data to run on the web worker
        :type data_queue: Queue
        :param worker_queue: The queue that holds the IDs of any idle workers
        :type worker_queue: Queue
        :rtype: None
        """
        logger.info("Selenium func worker started")
        while True:
            current_data = data_queue.get()
            if current_data == 'STOP':
                # If a stop is encountered then end this listener and put the stop back onto the queue
                # to poison other listeners on the same queue
                logger.warning("STOP encountered, killing worker thread")
                data_queue.put(current_data)
                break

            logger.info("Got the item %s on the data queue", current_data)
            # Blocks until the ID of a currently free worker is available
            worker_id = worker_queue.get()
            try:
                # Assign current worker and current data to your selenium function
                selenium_task(selenium_workers[worker_id], current_data)
            except Exception:
                # One bad item must not take the whole listener thread down with it
                logger.exception("Selenium task failed for item %s on worker %s", current_data, worker_id)
            finally:
                # Always return the worker, otherwise a failed task would remove it from the pool for good
                worker_queue.put(worker_id)


    # Create one new queue listener thread per selenium worker
    logger.info("Starting selenium background processes")
    selenium_processes = [Thread(target=selenium_queue_listener,
                                 args=(selenium_data_queue, worker_queue)) for _ in worker_ids]

    try:
        for p in selenium_processes:
            # Daemon threads do not keep the interpreter alive if the main thread exits early
            p.daemon = True
            p.start()

        # Add each item of data to the data queue, this could be done over time so long as the selenium queue
        # listening threads are still running
        logger.info("Adding data to data queue")
        for d in selenium_data:
            selenium_data_queue.put(d)

        # Wait for all selenium queue listening threads to complete, this happens when the queue listener returns
        logger.info("Waiting for Queue listener threads to complete")
        for p in selenium_processes:
            p.join()
    finally:
        # Quit all the web workers even when the run above was interrupted or raised, so that no browser
        # is left running
        logger.info("Tearing down web workers")
        for b in selenium_workers.values():
            b.quit()