Skip to content

Instantly share code, notes, and snippets.

@nickthorpe
Created August 4, 2019 17:23
Show Gist options
  • Select an option

  • Save nickthorpe/f201bc9c7a9fcee2ccd11b08c7cbbefc to your computer and use it in GitHub Desktop.

Select an option

Save nickthorpe/f201bc9c7a9fcee2ccd11b08c7cbbefc to your computer and use it in GitHub Desktop.
Parallelized Pandas Apply
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
def parallelize(data, func, num_of_processes=8):
    """Apply ``func`` to consecutive chunks of ``data`` in worker processes.

    ``data`` is cut into at most ``num_of_processes`` nearly equal,
    order-preserving chunks.  Each chunk is passed to ``func`` in a
    ``multiprocessing.Pool`` worker and the per-chunk results are glued back
    together with ``pd.concat``.

    Args:
        data: The object to split.  Anything exposing ``.iloc`` (DataFrame,
            Series) is sliced positionally; any other input is handed to
            ``np.array_split``.
        func: Callable invoked once per chunk.  It is sent to the workers by
            ``Pool.map``, so it has to be picklable (e.g. a module-level
            function or a ``functools.partial`` of one), and it must return
            something ``pd.concat`` accepts.
        num_of_processes: Upper bound on the number of chunks and workers.

    Returns:
        The result of ``pd.concat`` over the per-chunk results, in chunk order.

    Raises:
        ValueError: If ``num_of_processes`` is smaller than 1.
    """
    if num_of_processes < 1:
        raise ValueError("num_of_processes must be at least 1")

    # Never create more chunks than rows: empty chunks only waste a worker and
    # their (empty) results can disturb the dtype/shape of the concatenation.
    # An empty input still yields exactly one (empty) chunk.
    num_chunks = max(1, min(num_of_processes, len(data)))

    if hasattr(data, "iloc"):
        # Slice pandas objects positionally instead of using np.array_split,
        # which relies on the deprecated DataFrame.swapaxes.  The boundaries
        # match np.array_split: the first `extra` chunks get one more row.
        base, extra = divmod(len(data), num_chunks)
        data_split = []
        start = 0
        for chunk_no in range(num_chunks):
            stop = start + base + (1 if chunk_no < extra else 0)
            data_split.append(data.iloc[start:stop])
            start = stop
    else:
        data_split = np.array_split(data, num_chunks)

    pool = Pool(num_chunks)
    try:
        results = pool.map(func, data_split)
    finally:
        # Shut the workers down even when func raised in a worker, so a
        # failing call does not leak processes.
        pool.close()
        pool.join()
    return pd.concat(results)
def run_on_subset(func, data_subset):
    """Call ``data_subset.apply(func, axis=1)`` and return its result.

    ``func`` comes first so that it can be pre-bound with
    ``functools.partial``, leaving a one-argument callable that takes only
    the data chunk.
    """
    row_wise_result = data_subset.apply(func, axis=1)
    return row_wise_result
def parallelize_on_rows(data, func, num_of_processes=8):
    """Run ``func`` over ``data`` via ``parallelize`` and ``run_on_subset``.

    ``func`` is bound as the first argument of ``run_on_subset`` with
    ``functools.partial``; the resulting one-argument callable is what
    ``parallelize`` receives, together with ``data`` and
    ``num_of_processes``.
    """
    # Pre-bind func so the callable handed on takes only the data chunk.
    row_worker = partial(run_on_subset, func)
    return parallelize(data, row_worker, num_of_processes)
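# Usage: df.apply(some_func, axis=1) becomes parallelize_on_rows(df, some_func).
# some_func must be picklable (e.g. a module-level function, not a lambda),
# because multiprocessing sends it to the worker processes.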
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment