import random
from pyspark.sql import Row

# Generate 10 million rows of random data: an integer key `a`,
# a random integer `num`, and a variable-length string `str`.
data = sc.parallelize(xrange(1000)).flatMap(lambda x:
    [Row(a=random.randint(1, 10),
         num=random.randint(1, 100),
         str=("a" * random.randint(1, 30)))
     for i in xrange(10000)])

# Turn the RDD of Rows into a DataFrame and write it out as Parquet.
dataTable = sqlContext.createDataFrame(data)
dataTable.saveAsParquetFile("/home/rxin/ints.parquet")
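To check the output, the file can be loaded back and queried through SQL. The following is a minimal sketch assuming the same sqlContext session, the same output path, and the Spark 1.x reader API (parquetFile):

# Load the Parquet file back into a DataFrame.
ints = sqlContext.parquetFile("/home/rxin/ints.parquet")

# Register it as a temporary table so it can be queried with SQL.
ints.registerTempTable("ints")

# Example query: average of `num` for each value of the key `a`.
sqlContext.sql("SELECT a, AVG(num) AS avg_num FROM ints GROUP BY a").show()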