Revisions

  1. InvisibleTech revised this gist Feb 5, 2015. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions ApacheSparkLoadCsvFileHDFS2SQLContextAndTranspose
@@ -1,7 +1,7 @@
 // Assuming the file is loaded into a localhost HDFS node:
 // hadoop fs -ls -R /
-// drwxr-xr-x - johnferguson supergroup 0 2015-02-02 22:26 /spark
-// -rw-r--r-- 1 johnferguson supergroup 78 2015-02-02 22:26 /spark/peopleall.txt
+// drwxr-xr-x - xxxxxxxxxxxx supergroup 0 2015-02-02 22:26 /spark
+// -rw-r--r-- 1 xxxxxxxxxxxx supergroup 78 2015-02-02 22:26 /spark/peopleall.txt
 //
 // All of this code is from
 // http://www.infoobjects.com/spark-sql-schemardd-programmatically-specifying-schema/
  2. InvisibleTech revised this gist Feb 5, 2015. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion ApacheSparkLoadCsvFileHDFS2SQLContextAndTranspose
@@ -3,7 +3,10 @@
 // drwxr-xr-x - johnferguson supergroup 0 2015-02-02 22:26 /spark
 // -rw-r--r-- 1 johnferguson supergroup 78 2015-02-02 22:26 /spark/peopleall.txt
 //
-// Much of this code is from http://www.infoobjects.com/spark-sql-schemardd-programmatically-specifying-schema/
+// All of this code is from
+// http://www.infoobjects.com/spark-sql-schemardd-programmatically-specifying-schema/
+// https://github.com/bbnsumanth/transposing
+//
 // with some tweaks by me to run on my files and HDFS node.
 //
 val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  3. InvisibleTech created this gist Feb 5, 2015.
    29 changes: 29 additions & 0 deletions ApacheSparkLoadCsvFileHDFS2SQLContextAndTranspose
    @@ -0,0 +1,29 @@
    // Assuming the file is loaded into a localhost HDFS node:
    // hadoop fs -ls -R /
    // drwxr-xr-x - johnferguson supergroup 0 2015-02-02 22:26 /spark
    // -rw-r--r-- 1 johnferguson supergroup 78 2015-02-02 22:26 /spark/peopleall.txt
    //
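// The file was presumably put there with something like:
//   hadoop fs -put peopleall.txt /spark/peopleall.txt
//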
    // Much of this code is from http://www.infoobjects.com/spark-sql-schemardd-programmatically-specifying-schema/
// with some tweaks by me to run on my files and HDFS node.
    //
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext._
    import org.apache.spark.sql._
// Load the data, define a schema, and register a temp table
    val person = sc.textFile("hdfs://localhost:9000/spark/peopleall.txt")
val schema = StructType(Array(
  StructField("firstName", StringType, true),
  StructField("lastName", StringType, true),
  StructField("age", IntegerType, true)))
    val rowRDD = person.map(_.split(",")).map(p => org.apache.spark.sql.Row(p(0),p(1),p(2).toInt))
    val personSchemaRDD = sqlContext.applySchema(rowRDD, schema)

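// Register the SchemaRDD under a name so it can be queried with SQL below.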
    personSchemaRDD.registerTempTable("person")

    // Spark SQL query
    sql("select * from person").foreach(println)

// Now transpose the data, based on the GitHub repo: https://github.com/bbnsumanth/transposing
    //
    import org.apache.spark.sql.Row

    val rows = sql("select * from person")
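// Tag each cell with its column index, swap so the column index becomes the key,
// group the cells of each column together, and collect each group into a vector:
// row-major (person, fields) becomes column-major (field, people). Note that
// groupByKey does not guarantee the order of values within a group.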
    val transposed = rows.map(x => x.toArray).flatMap(x => x.zipWithIndex).map(x => x.swap).groupByKey.map(x => (x._1,x._2.toVector))
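
For illustration, here is a minimal self-contained sketch of the same transpose idiom on a plain RDD, runnable in spark-shell (where sc already exists); the sample names and ages are assumptions of mine, not data from the gist:

// Sample rows mirroring the (firstName, lastName, age) layout -- assumed data.
val sample = sc.parallelize(Seq(
  Array[Any]("Ann", "Lee", 34),
  Array[Any]("Bob", "Ray", 25)))

// Same idiom as above: index each cell by column, key by that index, group,
// and collect each column into a vector.
val t = sample.flatMap(_.zipWithIndex).map(_.swap).groupByKey.map(x => (x._1, x._2.toVector))

// sortByKey is added here only to make the printed column order deterministic.
t.sortByKey().collect().foreach(println)
// Prints something like (cell order within a column is not guaranteed):
// (0,Vector(Ann, Bob))
// (1,Vector(Lee, Ray))
// (2,Vector(34, 25))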