This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // XPath CheatSheet | |
| // To test XPath in your Chrome Debugger: $x('/html/body') | |
| // http://www.jittuu.com/2012/2/14/Testing-XPath-In-Chrome/ | |
| // 0. XPath Examples. | |
| // More: http://xpath.alephzarro.com/content/cheatsheet.html | |
| '//hr[@class="edge" and position()=1]' // every first hr of 'edge' class |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #Check out our lab for practice: https://labs.itversity.com | |
| #Create RDD from file in HDFS | |
| orders = sc.textFile("/public/retail_db/orders") | |
| #Create RDD from local file (data from file -> collection -> RDD) | |
| productsList = open("/data/retail_db/products/part-00000").read().splitlines() | |
| productsRDD = sc.parallelize(productsList) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| daily_revenue_per_product_df.show(100) | |
| daily_revenue_per_product_df.save("/user/dgadiraju/daily_revenue_save", "json") | |
| daily_revenue_per_product_df.write.json("/user/dgadiraju/daily_revenue_write") | |
| daily_revenue_per_product_df.select("order_date", "daily_revenue_per_product").show() | |
| daily_revenue_per_product_df.filter(daily_revenue_per_product_df["order_date"] == "2013-07-26 00:00:00.0").show() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| sqlContext.sql("CREATE DATABASE dgadiraju_daily_revenue"); sqlContext.sql("CREATE TABLE dgadiraju_daily_revenue.daily_revenue (order_date string, product_name string, daily_revenue_per_product float) STORED AS orc") | |
| daily_revenue_per_product_df = sqlContext.sql("SELECT o.order_date, p.product_name, sum(oi.order_item_subtotal) daily_revenue_per_product \ FROM orders o JOIN order_items oi \ ON o.order_id = oi.order_item_order_id \ JOIN products p \ ON p.product_id = oi.order_item_product_id \ WHERE o.order_status IN ('COMPLETE', 'CLOSED') \ GROUP BY o.order_date, p.product_name \ ORDER BY o.order_date, daily_revenue_per_product DESC") | |
| daily_revenue_per_product_df.insertInto("dgadiraju_daily_revenue.daily_revenue") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pyspark.sql import Row | |
| ordersRDD = sc.textFile("/public/retail_db/orders") | |
| ordersDF = ordersRDD.\ | |
| map(lambda o: Row(order_id=int(o.split(",")[0]), order_date=o.split(",")[1], order_customer_id=int(o.split(",")[2]), order_status=o.split(",")[3])).toDF() | |
| ordersDF.registerTempTable("ordersDF_table") | |
| sqlContext.sql("select order_status, count(1) from ordersDF_table group by order_status").show() | |
| sqlContext.sql("use dgadiraju_retail_db_txt") | |
| from pyspark.sql import Row | |
| productsRaw = open("/data/retail_db/products/part-00000").read().splitlines() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pyspark.sql import Row | |
| ordersRDD = sc.textFile("/public/retail_db/orders") | |
| ordersDF = ordersRDD.\ | |
| map(lambda o: Row(order_id=int(o.split(",")[0]), order_date=o.split(",")[1], order_customer_id=int(o.split(",")[2]), order_status=o.split(",")[3])).toDF() | |
| ordersDF.registerTempTable("ordersDF_table") | |
| sqlContext.sql("select order_status, count(1) from ordersDF_table group by order_status").show() | |
| sqlContext.sql("use dgadiraju_retail_db_txt") | |
| from pyspark.sql import Row | |
| productsRaw = open("/data/retail_db/products/part-00000").read().splitlines() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| select * from ( | |
| select o.order_id, o.order_date, o.order_status, oi.order_item_subtotal, | |
| round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) order_revenue, | |
| oi.order_item_subtotal/round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) pct_revenue, | |
| round(avg(oi.order_item_subtotal) over (partition by o.order_id), 2) avg_revenue, | |
| rank() over (partition by o.order_id order by oi.order_item_subtotal desc) rnk_revenue, | |
| dense_rank() over (partition by o.order_id order by oi.order_item_subtotal desc) dense_rnk_revenue, | |
| percent_rank() over (partition by o.order_id order by oi.order_item_subtotal desc) pct_rnk_revenue, | |
| row_number() over (partition by o.order_id order by oi.order_item_subtotal desc) rn_orderby_revenue, | |
| row_number() over (partition by o.order_id) rn_revenue, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| select * from ( | |
| select o.order_id, o.order_date, o.order_status, oi.order_item_subtotal, | |
| round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) order_revenue, | |
| oi.order_item_subtotal/round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) pct_revenue, | |
| round(avg(oi.order_item_subtotal) over (partition by o.order_id), 2) avg_revenue, | |
| rank() over (partition by o.order_id order by oi.order_item_subtotal desc) rnk_revenue, | |
| dense_rank() over (partition by o.order_id order by oi.order_item_subtotal desc) dense_rnk_revenue, | |
| percent_rank() over (partition by o.order_id order by oi.order_item_subtotal desc) pct_rnk_revenue, | |
| row_number() over (partition by o.order_id order by oi.order_item_subtotal desc) rn_orderby_revenue, | |
| row_number() over (partition by o.order_id) rn_revenue |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| select * from ( | |
| select o.order_id, o.order_date, o.order_status, oi.order_item_subtotal, | |
| round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) order_revenue, | |
| oi.order_item_subtotal/round(sum(oi.order_item_subtotal) over (partition by o.order_id), 2) pct_revenue, | |
| round(avg(oi.order_item_subtotal) over (partition by o.order_id), 2) avg_revenue | |
| from orders o join order_items oi | |
| on o.order_id = oi.order_item_order_id | |
| where o.order_status in ('COMPLETE', 'CLOSED')) q | |
| where order_revenue >= 1000 | |
| order by order_date, order_revenue desc, rank_revenue; |