Skip to content

Instantly share code, notes, and snippets.

@edwintyh
Created February 2, 2022 04:08
Show Gist options
  • Save edwintyh/e866ebdcaf732bb8636dc7aa382bf027 to your computer and use it in GitHub Desktop.
Save edwintyh/e866ebdcaf732bb8636dc7aa382bf027 to your computer and use it in GitHub Desktop.

Revisions

  1. edwintyh created this gist Feb 2, 2022.
    34 changes: 34 additions & 0 deletions pyspark_info.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,34 @@
    def info(self, show=True):
        '''
        Summarise a pyspark.sql.DataFrame: one row per column holding the
        column name, its number of non-null values and its data type.

        Relies on two names from the enclosing module that are not defined
        in this function: ``f`` (presumably ``pyspark.sql.functions``) and
        ``spark`` (presumably the active SparkSession).

        Args:
            show (bool): default True. When True the summary is printed and
                nothing is returned; when False the summary is returned
                without being printed.

        Returns:
            pyspark.sql.DataFrame with the columns 'Column',
            'Non-Null Count' and 'Dtype' when ``show`` is False,
            otherwise None.
        '''
        subset = self.schema.names

        # count(<column>) only counts non-null values, so one aggregation
        # yields the non-null totals directly. Compared with "row count minus
        # a sum of null flags" it returns 0 rather than NULL on an empty
        # DataFrame and does not need a separate count() job.
        # Names are back-quoted (embedded back-quotes doubled) so a column
        # called e.g. "a.b" is resolved as one column instead of being parsed
        # as field "b" of a struct "a".
        non_null_exprs = [
            f.count(f.col('`' + col.replace('`', '``') + '`')).alias(col)
            for col in subset
        ]

        # The aggregate is a single wide row; transposing it in pandas turns
        # it into one (column name, count) row per source column.
        _non_null = (
            self.select(non_null_exprs)
            .toPandas()
            .transpose()
            .reset_index()
            .rename(columns={'index': 'Column', 0: 'Non-Null Count'})
        )
        _non_null = spark.createDataFrame(_non_null)

        # self.dtypes is a list of (name, type) pairs, which createDataFrame
        # exposes as the columns _1 and _2.
        _dtype = (
            spark.createDataFrame(self.dtypes)
            .withColumnRenamed('_1', 'Column')
            .withColumnRenamed('_2', 'Dtype')
        )
        result = _dtype.join(_non_null, on='Column').select(
            'Column', 'Non-Null Count', 'Dtype'
        )

        if show:
            # show() defaults to 20 rows and 20-character cells; request one
            # row per column, untruncated, so wide DataFrames and long
            # names/dtypes are not silently cut from the summary.
            result.show(len(subset), truncate=False)
            return None
        return result

    # Attach info() to pyspark.sql.DataFrame as a method, so it can be
    # called on any DataFrame instance as df.info().
    pyspark.sql.DataFrame.info = info