# Load the parquet file and keep only the grouping column "a" and the
# value column "num".
pdata = sqlContext.load("/home/rxin/ints.parquet").select("a", "num")

# Build one (key, [sum, count]) pair per distinct value of "a":
#   1. map each row to (a, [num, 1]);
#   2. merge pairs that share a key by adding sums and counts element-wise;
#   3. collect the reduced pairs into a local list.
sum_count = (
    pdata.map(lambda row: (row.a, [row.num, 1]))
    .reduceByKey(
        lambda left, right: [left[0] + right[0], left[1] + right[1]]
    )
    .collect()
)

# Per-key average as (key, sum / count). float() forces true division even
# when both operands are integers. This is a bare expression: its value is
# only displayed when run in an interactive session.
[(key, float(total) / count) for key, (total, count) in sum_count]