OptimizedSparkInnerJoin (a gist by @mkolod, Jul 6, 2015)
import scala.reflect.ClassTag

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/** Hive/Pig/Cascading/Scalding-style inner join which will perform a map-side/replicated/broadcast
  * join if the "small" relation has at most maxNumRows rows, and a reduce-side join otherwise.
  * @param big the large relation
  * @param small the small relation
  * @param maxNumRows the maximum number of rows that the small relation can have to be a
  *                   candidate for a map-side/replicated/broadcast join
  * @return a joined RDD with a common key and a tuple of values from the two
  *         relations (the big relation value first, followed by the small one)
  */
private def optimizedInnerJoin[A : ClassTag, B : ClassTag, C : ClassTag]
    (big: RDD[(A, B)], small: RDD[(A, C)], maxNumRows: Long): RDD[(A, (B, C))] = {

  /* This is needed for efficiency's sake, since the choice between
   * map- and reduce-side joins is based on the row count of the
   * smaller relation, and the count will materialize the small relation.
   * If it is too big for a map-side join, it will already be cached
   * for the reduce-side join. Caching is idempotent, so nothing
   * happens if the dataset is already cached.
   */
  small.cache()
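  // cache() is shorthand for persist(StorageLevel.MEMORY_ONLY); the count()
  // below is the first action on `small`, so it is what populates the cache.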

  val joined =
    if (small.count() <= maxNumRows) {

      /* There is a simpler alternative, "small.collectAsMap()"
       * (http://ampcamp.berkeley.edu/wp-content/uploads/2012/06/matei-zaharia-amp-camp-2012-advanced-spark.pdf),
       * but it gives incorrect results: a map deduplicates entries with identical keys,
       * whereas duplicate keys are a normal occurrence in MapReduce frameworks (that is
       * the rationale for grouping entries by key in the reduce stage). The simpler
       * solution therefore silently drops rows in such cases, which constitute the
       * vast majority of key-value RDD use cases.
       */
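      // Illustration with hypothetical data: collectAsMap keeps only one value
      // per key, e.g. sc.parallelize(Seq((1, "a"), (1, "b"))).collectAsMap()
      // returns a one-entry map (which of "a"/"b" survives is unspecified),
      // so one of the two rows would vanish from the join output.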
      val grouped: Map[A, Array[C]] =
        small
          .collect()
          .groupBy { case (key, _) => key }
          .map { case (key, kvs) => (key, kvs.map { case (_, v) => v }) }
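      // With hypothetical input small.collect() == Array((1, "x"), (1, "y"), (2, "z")),
      // grouped == Map(1 -> Array("x", "y"), 2 -> Array("z")): both values under
      // the duplicated key 1 survive, unlike with collectAsMap.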

      /* Broadcast the map representing the small relation to all nodes.
       * Joining against the big dataset will be done locally on each node
       * for all partitions at the map stage. This is called a map-side join
       * in Hadoop-land, or a replicated join in distributed relational
       * databases. In the Spark context, we can also call it a broadcast join.
       */
      val smallBc = big.sparkContext.broadcast(grouped)

      big.flatMap { case (a, b) =>
        smallBc.value.get(a) match {
          case Some(cs) => cs.iterator.map(c => (a, (b, c)))
          case None     => Iterator.empty
        }
      }

    } else {

      // "Small" dataset is too big - do a regular reduce-side join using the RDD API
      big.join(small)

    }

  /* Note: RDDs are lazy, so in the reduce-side case big.join(small) has not
   * executed yet; unpersisting here means `small` may be recomputed from its
   * lineage when `joined` is finally materialized. Callers who want to keep
   * the cache warm should instead unpersist after the first action on `joined`.
   */
  small.unpersist(blocking = false)
  joined
}
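
/* A minimal usage sketch, not part of the original gist: the sample data and
 * maxNumRows = 1000 are hypothetical, and it assumes this method lives
 * alongside optimizedInnerJoin (which is private) with a live SparkContext.
 */
def exampleUsage(sc: SparkContext): Unit = {
  val big: RDD[(Int, String)] =
    sc.parallelize(Seq((1, "b1"), (1, "b2"), (2, "b3"), (3, "b4")))
  val small: RDD[(Int, String)] =
    sc.parallelize(Seq((1, "s1"), (2, "s2"), (2, "s3")))

  // `small` has 3 rows, which is <= 1000, so the broadcast (map-side) path is taken.
  val joined = optimizedInnerJoin(big, small, maxNumRows = 1000L)

  // Inner-join semantics: key 3 has no match in `small` and is dropped.
  // Expected output (order not guaranteed):
  //   (1,(b1,s1)), (1,(b2,s1)), (2,(b3,s2)), (2,(b3,s3))
  joined.collect().foreach(println)
}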