import pandas as pd def _map_to_pandas(rdds): """ Needs to be here due to pickling issues """ return [pd.DataFrame(list(rdds))] def toPandas(df, n_partitions=None): """ Returns the contents of this `DataFrame` as Pandas `pandas.DataFrame` in a speedy fashion. The DataFrame is repartitioned if `n_partitions` is passed. :param df: pyspark.sql.DataFrame :param n_partitions: int or None :return: pandas.DataFrame """ if n_partitions is not None: df = df.repartition(n_partitions) df_pand = df.rdd.mapPartitions(_map_to_pandas).collect() df_pand = pd.concat(df_pand) df_pand.columns = df.columns return df_pand