edwintyh · February 2, 2022 04:08 · Feb 2, 2022
diff --git a/pyspark_info.py b/pyspark_info.py
@@ -0,0 +1,34 @@
+def info(self, show = True):
+
+  '''
+  Print concise summary of a pyspark.sql.DataFrame
+  This method prints information about a DataFrame
+  including the index dtype and columns, non-null values
+  
+  Args:
+    show(bool): default True. show result
+    
+    
+  Returns:
+    pyspark.sql.DataFrame
+  
+  '''
+
+  subset = self.schema.names
+  total_rows = self.count()
+  _non_null = \
+    self.select([(total_rows - f.sum(f.when(f.col(col).isNull(),1).otherwise(0))).alias(col) for col in subset])\
+    .toPandas()\
+    .transpose()\
+    .reset_index()\
+    .rename(columns={'index':'Column', 0:'Non-Null Count'})
+  _non_null = spark.createDataFrame(_non_null)
+  _dtype = spark.createDataFrame(self.dtypes).withColumnRenamed('_1','Column').withColumnRenamed('_2','Dtype')
+  result = _dtype.join(_non_null, on = 'Column').select('Column', 'Non-Null Count', 'Dtype')
+
+  if show:
+    return result.show()
+  else:
+    return result
+
+pyspark.sql.DataFrame.info = info