InvisibleTech · February 5, 2016 01:34 · Feb 5, 2016
diff --git a/joiner.scala b/joiner.scala
@@ -0,0 +1,22 @@
+// Need to :paste this into Spark Shell to see it work.
+//
+
+// Load up the columns
+val alpha = sc.parallelize(List("a", "b", "c", "d"))
+val nums = sc.parallelize(List(1, 2, 3, 4))
+
+// Key them by index
+val alphaK = alpha.zipWithIndex.map(t => (t._2, t._1))
+val numsK = nums.zipWithIndex.map(t => (t._2, t._1))
+
+// Join them - which gives you (k (v1, v2))
+val joined = alphaK.join(numsK)
+
+// Join the join again - adding a third column of duplicate data - (k ((V1, v2), v3))
+val dupes =joined.join(alphaK)
+
+// Okay - now flattent the tuples to a list - there may be a better way
+val flatter = dupes.map(t => (t._1, t._2._1.productIterator.toList ++ List(t._2._2)))
+
+// Take out the key - now all you have are three column rows
+flatter.map(_._2).collect
diff --git a/sampleoutput.sh b/sampleoutput.sh
@@ -0,0 +1,7 @@
+scala> dupes.collect
+res54: Array[(Long, ((String, Int), String))] = Array((0,((a,1),a)), (1,((b,2),b)), (2,((c,3),c)), (3,((d,4),d)))
+
+scala> flatter.map(_._2).collect
+res55: Array[List[Any]] = Array(List(a, 1, a), List(b, 2, b), List(c, 3, c), List(d, 4, d))
+
+scala>