Skip to content

Instantly share code, notes, and snippets.

@nickthorpe
Created August 4, 2019 17:23
Show Gist options
  • Select an option

  • Save nickthorpe/f201bc9c7a9fcee2ccd11b08c7cbbefc to your computer and use it in GitHub Desktop.

Select an option

Save nickthorpe/f201bc9c7a9fcee2ccd11b08c7cbbefc to your computer and use it in GitHub Desktop.

Revisions

  1. nickthorpe created this gist Aug 4, 2019.
    19 changes: 19 additions & 0 deletions pandas_apply.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,19 @@
    from functools import partial
    from multiprocessing import Pool

    import numpy as np
    import pandas as pd

    def parallelize(data, func, num_of_processes=8):
        """Apply ``func`` to chunks of ``data`` in worker processes and concatenate.

        ``data`` is cut into at most ``num_of_processes`` contiguous chunks, sized
        the way ``np.array_split`` sizes them. Each chunk is passed to ``func`` in
        a ``multiprocessing.Pool`` worker, and the per-chunk results are joined
        with ``pd.concat`` in their original order.

        Args:
            data: Object to split. Objects exposing ``.iloc`` (pandas DataFrame or
                Series) are sliced positionally; anything else is handed to
                ``np.array_split`` directly.
            func: Callable taking one chunk and returning something ``pd.concat``
                accepts. It is sent to the worker processes, so it must be
                picklable (a module-level function or a ``partial`` of one).
            num_of_processes: Upper bound on the number of chunks and workers.

        Returns:
            The result of ``pd.concat`` over the per-chunk results.

        Raises:
            ValueError: If ``num_of_processes`` is less than 1.
        """
        # Never create more chunks than rows: empty chunks would only start idle
        # workers and push empty results into the concat. One chunk is kept for
        # empty input so it still goes through ``func``.
        chunk_count = min(num_of_processes, max(len(data), 1))

        if hasattr(data, "iloc"):
            # np.array_split on a DataFrame relies on DataFrame.swapaxes, which
            # pandas has deprecated; split the row positions instead and slice
            # the pandas object with iloc.
            positions = np.array_split(np.arange(len(data)), chunk_count)
            data_split = [data.iloc[rows] for rows in positions]
        else:
            data_split = np.array_split(data, chunk_count)

        pool = Pool(chunk_count)
        try:
            results = pool.map(func, data_split)
        finally:
            # Shut the workers down even when func raises, so they do not leak.
            pool.close()
            pool.join()
        return pd.concat(results)

    def run_on_subset(func, data_subset):
        """Apply ``func`` to every row of ``data_subset``.

        Args:
            func: Callable invoked once per row (it is passed to ``apply`` with
                ``axis=1``).
            data_subset: Object whose ``apply`` method accepts ``axis=1``;
                presumably a pandas DataFrame chunk.

        Returns:
            Whatever ``data_subset.apply(func, axis=1)`` returns.
        """
        return data_subset.apply(func, axis=1)

    def parallelize_on_rows(data, func, num_of_processes=8):
        """Row-wise ``apply`` of ``func`` over ``data``, spread across processes.

        Binds ``func`` into ``run_on_subset`` with ``functools.partial`` and hands
        the resulting chunk-level callable to ``parallelize``.

        Args:
            data: Object to process; it is split into chunks by ``parallelize``.
            func: Callable invoked once per row. Because ``parallelize`` sends the
                callable to ``multiprocessing.Pool`` workers, ``func`` must be
                picklable (a module-level function rather than a lambda).
            num_of_processes: Forwarded unchanged to ``parallelize``.

        Returns:
            The concatenated per-chunk results returned by ``parallelize``.
        """
        apply_to_chunk = partial(run_on_subset, func)
        return parallelize(data, apply_to_chunk, num_of_processes)

    # so df.apply(some_func, axis=1) becomes parallelize_on_rows(df, some_func)