------------------------------------------------------------------------------------------------------------
-- Script to migrate data from monthly tables into one table.
-- filename: dynamoTodynamo.hive
-- author: leewc
-- Instructions:
-- Run: hive -hiveconf domain="Devo" -hiveconf region="NA" -hiveconf yyyymm=201604 -f dynamoTodynamo.hive
-- Variables passed from bash are accessible via hiveconf.
-- Variables are case-sensitive, i.e. test != Test
------------------------------------------------------------------------------------------------------------

-- Start with a clean slate.
DROP TABLE src_table_db;
DROP TABLE dst_table_db;

-- Set the time to live (expiration offset for the copied items) in seconds. (one year in this example)
SET hivevar:ttl=31536000;

-- Set the read and write throughput percentages (fractions of provisioned capacity; valid range is 0.1 to 1.5).
SET dynamodb.throughput.write.percent=1.0;
SET dynamodb.throughput.read.percent=0.5;

!echo "Read Write Capacity Percentages set.";
!echo "USING DOMAIN: ${hiveconf:domain} REGION: ${hiveconf:region}";
!echo;

-- Note that the column mapping reads left to right: [ HIVE_COLUMN -> DDB_COLUMN ]
CREATE EXTERNAL TABLE src_table_db (CustomerID string, DateTime string, URLClicked string, OrderID string, ProductID string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name" = "CustomerTracker_${hiveconf:domain}_${hiveconf:region}_${hiveconf:yyyymm}",
"dynamodb.column.mapping" = "CustomerID:CustomerID,DateTime:DateTime,URLClicked:URLClicked,OrderID:OrderID,ProductID:ProductID");

-- The table properties above can also map Hive columns to differently named DynamoDB attributes,
-- but in this example there's no need to.

!echo "CREATED SOURCE TABLE.";
!echo;

CREATE EXTERNAL TABLE dst_table_db (CustomerID string, CreationDateTime bigint, ExpirationTimestamp bigint, OrderID string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name" = "${hiveconf:domain}_${hiveconf:region}_AnniversaryPromo_2017",
"dynamodb.column.mapping" = "CustomerID:CustomerID,CreationDateTime:CreationDateTime,ExpirationTimestamp:ExpirationTimestamp,OrderID:OrderID");

!echo "CREATED DESTINATION TABLE. Describing all tables...";
!echo;

DESCRIBE src_table_db;
!echo;
DESCRIBE dst_table_db;

!echo "Starting the copy ...";
!echo "Using a Time to Live value of ${hivevar:ttl} seconds for the expiration date.";
!echo;

-- Parse the source DateTime from its Joda-formatted string into epoch seconds for CreationDateTime,
-- and add the TTL to it for ExpirationTimestamp.
-- We are effectively copying the data and transforming it on the fly so that both timestamps land in the
-- destination in the numeric format that DynamoDB's TTL feature expects.
-- We lose some accuracy when converting to epoch because the stored format is Joda time and the built-in
-- Hive date functions do not support it.
-- We could get around the formatting issue by loading a custom Joda time UDF, but it's more trouble than
-- it's worth for this use case. See https://github.com/lithiumtech/yoda-time

INSERT OVERWRITE TABLE dst_table_db
SELECT CustomerID,
       unix_timestamp(DateTime, 'yyyy-MM-dd') AS CreationDateTime,
       unix_timestamp(DateTime, 'yyyy-MM-dd') + ${hivevar:ttl} AS ExpirationTimestamp,
       OrderID
FROM src_table_db
WHERE length(OrderID) > 0;
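-- If exact timestamps mattered, the Joda UDF route mentioned above would look roughly like the sketch below.
-- It is left commented out and is not part of this migration; the jar location and class name are
-- assumptions for illustration only.
-- ADD JAR s3://your-bucket/jars/joda-time-udf.jar;
-- CREATE TEMPORARY FUNCTION joda_to_epoch AS 'com.example.hive.udf.JodaToEpoch';
-- The SELECT above would then use joda_to_epoch(DateTime) instead of unix_timestamp(DateTime, 'yyyy-MM-dd').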
-- Optional: add a WHERE length(CustomerID) > 0 clause to guard against inserting a null value into the
-- primary key of the destination table. I don't need to do this in this example because CustomerID is the
-- primary key of the source table, and a primary key field can never be null in the source table to begin with.
-- Beyond that, you can add other filters, e.g. on URLClicked, to copy only the rows you want.

!echo "INSERT COMPLETE.";
!echo "Backfill complete.";
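-- Since the source tables are monthly, a shell loop around this script can backfill several months in one
-- run. The month list below is just an example invocation, not part of this script:
-- for yyyymm in 201601 201602 201603 201604; do
--   hive -hiveconf domain="Devo" -hiveconf region="NA" -hiveconf yyyymm=$yyyymm -f dynamoTodynamo.hive
-- done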