
@dat-vikash
Forked from joshlk/faster_toPandas.py
Created February 27, 2018 12:52
Revisions

  1. @joshlk joshlk revised this gist Apr 8, 2016. 1 changed file with 1 addition and 1 deletion.

     faster_toPandas.py (1 addition, 1 deletion):

     @@ -6,7 +6,7 @@ def _map_to_pandas(rdds):

      def toPandas(df, n_partitions=None):
          """
     -    Returns the contents of this `DataFrame` as Pandas `pandas.DataFrame` in a speedy fashion. The DataFrame is
     +    Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
          repartitioned if `n_partitions` is passed.
          :param df: pyspark.sql.DataFrame
          :param n_partitions: int or None
  2. @joshlk joshlk revised this gist Apr 8, 2016. 1 changed file with 1 addition and 5 deletions.

     faster_toPandas.py (1 addition, 5 deletions):

     @@ -12,12 +12,8 @@ def toPandas(df, n_partitions=None):
          :param n_partitions: int or None
          :return: pandas.DataFrame
          """
     -
     -    if n_partitions is not None:
     -        df = df.repartition(n_partitions)
     -
     +    if n_partitions is not None: df = df.repartition(n_partitions)
          df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()
          df_pand = pd.concat(df_pand)
          df_pand.columns = df.columns
     -
          return df_pand
  3. @joshlk joshlk created this gist Mar 22, 2016. 1 new file with 23 additions.

     faster_toPandas.py (23 additions):

     @@ -0,0 +1,23 @@
     +import pandas as pd
     +
     +def _map_to_pandas(rdds):
     +    """ Needs to be here due to pickling issues """
     +    return [pd.DataFrame(list(rdds))]
     +
     +def toPandas(df, n_partitions=None):
     +    """
     +    Returns the contents of this `DataFrame` as Pandas `pandas.DataFrame` in a speedy fashion. The DataFrame is
     +    repartitioned if `n_partitions` is passed.
     +    :param df: pyspark.sql.DataFrame
     +    :param n_partitions: int or None
     +    :return: pandas.DataFrame
     +    """
     +
     +    if n_partitions is not None:
     +        df = df.repartition(n_partitions)
     +
     +    df_pand = df.rdd.mapPartitions(_map_to_pandas).collect()
     +    df_pand = pd.concat(df_pand)
     +    df_pand.columns = df.columns
     +
     +    return df_pand
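The core trick in the gist is to build one pandas `DataFrame` per partition on the executors (via `mapPartitions`), then `collect()` the small list of frames and `pd.concat` them, rather than collecting raw rows one at a time. The following sketch illustrates that same two-step shape locally, with no Spark cluster: the `partitions` list of dicts is hypothetical stand-in data for what `df.rdd.mapPartitions(_map_to_pandas)` would see on each executor.

```python
import pandas as pd

def _map_to_pandas(rdds):
    """Turn an iterator of rows into a single-element list holding one DataFrame."""
    return [pd.DataFrame(list(rdds))]

# Hypothetical partition contents, standing in for the rows each Spark
# partition would feed to _map_to_pandas via df.rdd.mapPartitions(...).
partitions = [
    [{"id": 1, "x": 10.0}, {"id": 2, "x": 20.0}],
    [{"id": 3, "x": 30.0}],
]

# One DataFrame per partition (what collect() would return as a list) ...
frames = [frame for part in partitions for frame in _map_to_pandas(iter(part))]

# ... stitched into a single local DataFrame, as toPandas does after collect().
result = pd.concat(frames, ignore_index=True)
```

The speed-up comes from moving the row→DataFrame conversion onto the executors in parallel, so the driver only performs one cheap `concat` instead of constructing the whole frame from collected `Row` objects.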