Skip to content

Instantly share code, notes, and snippets.

@selvanponraj
Forked from AtlasPilotPuppy/hbase_rdd.scala
Created February 29, 2016 12:51
Show Gist options
  • Save selvanponraj/cf62dbdd70e4319f57f7 to your computer and use it in GitHub Desktop.
Save selvanponraj/cf62dbdd70e4319f57f7 to your computer and use it in GitHub Desktop.
Accessing HBase from Apache Spark
import java.nio.charset.StandardCharsets

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.NewHadoopRDD
// Spark context for this script: master "local", application name "Simple App".
val sc = new SparkContext(master = "local", appName = "Simple App")
/**
 * Builds a fresh HBase configuration with `TableInputFormat.INPUT_TABLE`
 * set to `tableName`.
 */
val hbaseConfiguration = (tableName: String) => {
  val conf = HBaseConfiguration.create()
  conf.set(TableInputFormat.INPUT_TABLE, tableName)
  conf
}
/**
 * Creates an RDD over the HBase table named `table`.
 *
 * Elements are (ImmutableBytesWritable, Result) pairs read through
 * `TableInputFormat`, using the configuration from `hbaseConfiguration(table)`.
 */
val tableRDD = (table: String) =>
  new NewHadoopRDD(
    sc,
    classOf[TableInputFormat],
    classOf[ImmutableBytesWritable],
    classOf[Result],
    hbaseConfiguration(table)
  )
// RDD over the example table "table-with-data".
val rdd: NewHadoopRDD[ImmutableBytesWritable, Result] = tableRDD("table-with-data")
/**
 * Convert columns to strings.
 *
 * For every row, looks up the "Column Family":"ColumnQualifier" column, picks
 * the cell with the greatest timestamp and decodes its value as UTF-8.
 * Rows for which the lookup returns no cells are skipped rather than failing
 * the whole job.
 */
val columns = rdd
  .map { case (_, result) =>
    result.getColumn(
      "Column Family".getBytes(StandardCharsets.UTF_8),
      "ColumnQualifier".getBytes(StandardCharsets.UTF_8))
  }
  .flatMap { keyValues =>
    keyValues.asScala
      // reduceLeftOption yields None for an empty cell list instead of throwing.
      .reduceLeftOption((a, b) => if (a.getTimestamp > b.getTimestamp) a else b)
      // Decode the raw bytes with an explicit charset; a per-byte toChar
      // conversion corrupts every non-ASCII character.
      .map(latest => new String(latest.getValue, StandardCharsets.UTF_8))
  }
/**
 * Another way to get multiple columns.
 *
 * Each row becomes a list holding, in order, the cells returned for the
 * qualifiers CQ1, CQ2 and CQ3 of column family "CF".
 */
val cols = rdd.map { case (_, result) =>
  val family = "CF".getBytes(StandardCharsets.UTF_8)
  List("CQ1", "CQ2", "CQ3").map { qualifier =>
    result.getColumn(family, qualifier.getBytes(StandardCharsets.UTF_8))
  }
}
// Remove invalid items from the RDD: keep only rows in which every requested
// column returned at least one cell.
val filtered = cols.filter(row => row.forall(cells => !cells.isEmpty))
/**
 * Convert all values to strings.
 *
 * For each column of each row, takes the first cell in the returned list and
 * decodes its value as UTF-8. Every cell list must be non-empty here;
 * `get(0)` throws on an empty list.
 */
val row_vals = filtered.map { row =>
  row.map(cells => new String(cells.get(0).getValue, StandardCharsets.UTF_8))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment