// XPath CheatSheet
// To test XPath in your Chrome Debugger: $x('/html/body')
// http://www.jittuu.com/2012/2/14/Testing-XPath-In-Chrome/
// 0. XPath Examples.
// More: http://xpath.alephzarro.com/content/cheatsheet.html
'//hr[@class="edge" and position()=1]' // the first hr under each parent, if it also has class 'edge'
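# A hedged sketch of checking the same XPath outside the browser with Python's
# lxml (an assumption here; the links above test it via Chrome's $x console helper)
from lxml import html

doc = html.fromstring("<body><div><hr class='edge'/><hr/></div></body>")  # hypothetical sample markup
print(doc.xpath('//hr[@class="edge" and position()=1]'))  # -> list containing the first <hr class='edge'>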
#Check out our lab for practice: https://labs.itversity.com
#Create RDD from file in HDFS
orders = sc.textFile("/public/retail_db/orders")
#Create RDD from local file (data from file -> collection -> RDD)
productsList = open("/data/retail_db/products/part-00000").read().splitlines()
productsRDD = sc.parallelize(productsList)
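#Quick sanity checks (a sketch; assumes the paths above exist on the lab cluster)
orders.take(3)       # first three lines of the HDFS-backed RDD
productsRDD.count()  # number of records parallelized from the local file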
#Preview the first 100 rows
daily_revenue_per_product_df.show(100)
#Write out as JSON: save() is the older Spark 1.x API, write.json() the current equivalent
daily_revenue_per_product_df.save("/user/dgadiraju/daily_revenue_save", "json")
daily_revenue_per_product_df.write.json("/user/dgadiraju/daily_revenue_write")
#Project and filter with the DataFrame API
daily_revenue_per_product_df.select("order_date", "daily_revenue_per_product").show()
daily_revenue_per_product_df.filter(daily_revenue_per_product_df["order_date"] == "2013-07-26 00:00:00.0").show()
sqlContext.sql("CREATE DATABASE dgadiraju_daily_revenue"); sqlContext.sql("CREATE TABLE dgadiraju_daily_revenue.daily_revenue (order_date string, product_name string, daily_revenue_per_product float) STORED AS orc")
daily_revenue_per_product_df = sqlContext.sql("SELECT o.order_date, p.product_name, sum(oi.order_item_subtotal) daily_revenue_per_product \
FROM orders o JOIN order_items oi \
ON o.order_id = oi.order_item_order_id \
JOIN products p \
ON p.product_id = oi.order_item_product_id \
WHERE o.order_status IN ('COMPLETE', 'CLOSED') \
GROUP BY o.order_date, p.product_name \
ORDER BY o.order_date, daily_revenue_per_product DESC")
daily_revenue_per_product_df.insertInto("dgadiraju_daily_revenue.daily_revenue")
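#A hedged sanity check that the insert landed (database and table names as created above)
sqlContext.sql("SELECT * FROM dgadiraju_daily_revenue.daily_revenue LIMIT 10").show()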
#Convert an RDD of CSV lines into a DataFrame using Row
from pyspark.sql import Row
ordersRDD = sc.textFile("/public/retail_db/orders")
ordersDF = ordersRDD.\
map(lambda o: o.split(",")).\
map(lambda o: Row(order_id=int(o[0]), order_date=o[1], order_customer_id=int(o[2]), order_status=o[3])).toDF()
ordersDF.registerTempTable("ordersDF_table")
sqlContext.sql("select order_status, count(1) from ordersDF_table group by order_status").show()
sqlContext.sql("use dgadiraju_retail_db_txt")
#Create a DataFrame from a local products file
from pyspark.sql import Row
productsRaw = open("/data/retail_db/products/part-00000").read().splitlines()
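#A sketch of the likely continuation, mirroring the orders pattern above;
#the Row field positions assume the standard retail_db products layout
productsRDD = sc.parallelize(productsRaw)
productsDF = productsRDD.\
map(lambda p: p.split(",")).\
map(lambda p: Row(product_id=int(p[0]), product_name=p[2])).toDF()
productsDF.registerTempTable("productsDF_table")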
-- Window functions over order_items: per-order revenue, share of revenue, average, and rankings
select * from (
select o.order_id, o.order_date, o.order_status, oi.order_item_subtotal,
round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) order_revenue,
oi.order_item_subtotal/round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) pct_revenue,
round(avg(oi.order_item_subtotal) over (partition by o.order_id), 2) avg_revenue,
rank() over (partition by o.order_id order by oi.order_item_subtotal desc) rnk_revenue,
dense_rank() over (partition by o.order_id order by oi.order_item_subtotal desc) dense_rnk_revenue,
percent_rank() over (partition by o.order_id order by oi.order_item_subtotal desc) pct_rnk_revenue,
row_number() over (partition by o.order_id order by oi.order_item_subtotal desc) rn_orderby_revenue,
row_number() over (partition by o.order_id) rn_revenue
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
where o.order_status in ('COMPLETE', 'CLOSED')) q
where order_revenue >= 1000
order by order_date, order_revenue desc, rnk_revenue;
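#The same windowed query can also be run from PySpark (a sketch; assumes the
#orders and order_items tables are visible to sqlContext, which on Spark 1.x
#must be a HiveContext for window functions to work)
top_order_items = sqlContext.sql("""
select * from (
select o.order_id, oi.order_item_subtotal,
round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) order_revenue,
rank() over (partition by o.order_id order by oi.order_item_subtotal desc) rnk_revenue
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
where o.order_status in ('COMPLETE', 'CLOSED')) q
where order_revenue >= 1000
order by order_revenue desc, rnk_revenue
""")
top_order_items.show()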