Skip to content

Instantly share code, notes, and snippets.

@edwintyh
Created February 2, 2022 04:08
Show Gist options
  • Save edwintyh/e866ebdcaf732bb8636dc7aa382bf027 to your computer and use it in GitHub Desktop.
Save edwintyh/e866ebdcaf732bb8636dc7aa382bf027 to your computer and use it in GitHub Desktop.
def info(self, show=True):
    '''
    Summarise the columns of a pyspark.sql.DataFrame.

    Builds a small DataFrame with one row per column of ``self`` (in schema
    order) holding the column name, the number of non-null values in that
    column and the column's Spark data type.

    Args:
        show(bool): default True. When True the summary is printed with
            ``DataFrame.show()``; when False the summary is returned.
    Returns:
        pyspark.sql.DataFrame with the columns ``Column``,
        ``Non-Null Count`` and ``Dtype`` when ``show`` is False.
        None when ``show`` is True (the summary is only printed).
    '''
    # Local import: keeps the helper independent of which pyspark names the
    # surrounding module happens to have imported.
    from pyspark.sql.types import LongType, StringType, StructField, StructType

    # Use the session that owns this DataFrame instead of relying on a
    # module-global ``spark``. ``DataFrame.sparkSession`` is only available
    # on newer PySpark releases, so fall back to the SQLContext's session.
    session = getattr(self, 'sparkSession', None)
    if session is None:
        session = self.sql_ctx.sparkSession

    columns = self.dtypes  # list of (column name, dtype string)

    if columns:
        # Backtick-quote every name so that names containing '.' are treated
        # as a single top-level column and not as a nested-field path.
        quoted = ['`' + name.replace('`', '``') + '`' for name, _ in columns]
        # count(column) counts non-null values only and yields 0 (not NULL)
        # on an empty DataFrame; all columns are handled in one aggregation.
        counts = self.agg(*[f.count(f.col(name)) for name in quoted]).first()
    else:
        # Nothing to aggregate for a DataFrame without columns.
        counts = ()

    rows = [
        (name, non_null, dtype)
        for (name, dtype), non_null in zip(columns, counts)
    ]

    # Explicit schema: no type inference, so an empty summary still works.
    schema = StructType([
        StructField('Column', StringType()),
        StructField('Non-Null Count', LongType()),
        StructField('Dtype', StringType()),
    ])
    result = session.createDataFrame(rows, schema)

    if show:
        # show() prints the table and returns None.
        return result.show()
    return result
# Monkey-patch: attach `info` to the pyspark.sql.DataFrame class so that it
# can be called as a method on a DataFrame, e.g. `df.info()`.
pyspark.sql.DataFrame.info = info
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment