------------------------------------------------------------------------------------------------------------
-- Script to migrate data from monthly tables into one table.
-- filename: dynamoTodynamo.hive
-- author: leewc
-- Instructions:
-- Run: hive -hiveconf domain="Devo" -hiveconf region="NA" -hiveconf yyyymm=201604 -f dynamoTodynamo.hive
-- Variables passed from bash are accessible via hiveconf.
-- Variables are case-sensitive, i.e. test != Test
------------------------------------------------------------------------------------------------------------

-- Start with a clean slate.
DROP TABLE src_table_db;
DROP TABLE dst_table_db;

-- Set the time to live (expiration offset for the copied items) in seconds. (one year in this example)
SET hivevar:ttl=31536000;

-- Set the read and write throughput percentages (fractions of provisioned capacity; valid range is 0.1 to 1.5).
SET dynamodb.throughput.write.percent=1.0;
SET dynamodb.throughput.read.percent=0.5;

!echo "Read Write Capacity Percentages set.";
!echo "USING DOMAIN: ${hiveconf:domain} REGION: ${hiveconf:region}";
!echo;

-- Note that the column mapping reads left to right: [ HIVE_COLUMN -> DDB_COLUMN ]
CREATE EXTERNAL TABLE src_table_db (CustomerID string, DateTime string, URLClicked string, OrderID string, ProductID string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name" = "CustomerTracker_${hiveconf:domain}_${hiveconf:region}_${hiveconf:yyyymm}",
"dynamodb.column.mapping" = "CustomerID:CustomerID,DateTime:DateTime,URLClicked:URLClicked,OrderID:OrderID,ProductID:ProductID");

-- The table properties above can also map Hive columns to differently named DynamoDB attributes,
-- but in this example there's no need to.

!echo "CREATED SOURCE TABLE.";
!echo;

CREATE EXTERNAL TABLE dst_table_db (CustomerID string, CreationDateTime bigint, ExpirationTimestamp bigint, OrderID string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name" = "${hiveconf:domain}_${hiveconf:region}_AnniversaryPromo_2017",
"dynamodb.column.mapping" = "CustomerID:CustomerID,CreationDateTime:CreationDateTime,ExpirationTimestamp:ExpirationTimestamp,OrderID:OrderID");

!echo "CREATED DESTINATION TABLE. Describing all tables...";
!echo;

DESCRIBE src_table_db;
!echo;
DESCRIBE dst_table_db;

!echo "Starting the copy ...";
!echo "Using a Time to Live value of ${hivevar:ttl} seconds for the expiration date.";
!echo;

-- Parse the source DateTime from its Joda-formatted string into epoch seconds for CreationDateTime,
-- and add the TTL to it for ExpirationTimestamp.
-- We are effectively copying the data and transforming it on the fly so that both timestamps land in the
-- destination in the numeric format that DynamoDB's TTL feature expects.
-- We lose some accuracy when converting to epoch because the stored format is Joda time and the built-in
-- Hive date functions do not support it.
-- We could get around the formatting issue by loading a custom Joda time UDF, but it's more trouble than
-- it's worth for this use case. See https://github.com/lithiumtech/yoda-time

INSERT OVERWRITE TABLE dst_table_db
SELECT CustomerID,
       unix_timestamp(DateTime, 'yyyy-MM-dd') AS CreationDateTime,
       unix_timestamp(DateTime, 'yyyy-MM-dd') + ${hivevar:ttl} AS ExpirationTimestamp,
       OrderID
FROM src_table_db
WHERE length(OrderID) > 0;
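-- If exact timestamps mattered, the Joda UDF route mentioned above would look roughly like the sketch below.
-- It is left commented out and is not part of this migration; the jar location and class name are
-- assumptions for illustration only.
-- ADD JAR s3://your-bucket/jars/joda-time-udf.jar;
-- CREATE TEMPORARY FUNCTION joda_to_epoch AS 'com.example.hive.udf.JodaToEpoch';
-- The SELECT above would then use joda_to_epoch(DateTime) instead of unix_timestamp(DateTime, 'yyyy-MM-dd').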
-- Optional: add a WHERE length(CustomerID) > 0 clause to guard against inserting a null value into the
-- primary key of the destination table. I don't need to do this in this example because CustomerID is the
-- primary key of the source table, and a primary key field can never be null in the source table to begin with.
-- Beyond that, you can add other filters, e.g. on URLClicked, to copy only the rows you want.

!echo "INSERT COMPLETE.";
!echo "Backfill complete.";
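-- Since the source tables are monthly, a shell loop around this script can backfill several months in one
-- run. The month list below is just an example invocation, not part of this script:
-- for yyyymm in 201601 201602 201603 201604; do
--   hive -hiveconf domain="Devo" -hiveconf region="NA" -hiveconf yyyymm=$yyyymm -f dynamoTodynamo.hive
-- done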