Last active
March 30, 2021 02:47
-
-
Save vishwanath79/67ef098022f545fe71246ce8975f9bb8 to your computer and use it in GitHub Desktop.
Revisions
-
vishwanath79 revised this gist
Jul 12, 2020 . 1 changed file with 81 additions and 14 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -42,7 +42,14 @@ { "cell_type": "code", "source": [ "from pyspark.sql.functions import lit\n", "# Add some cosmetic columns to simulate synthetic data\n", "user_df = user_df.withColumn(\n", " \"product\",\n", " lit(\"coffee\")\n", " ).withColumn(\n", " \"behavior\",lit(\"normal\")\n", " )\n" ], "metadata": {}, "outputs": [ @@ -61,7 +68,8 @@ { "cell_type": "code", "source": [ "# Raw data\n", "user_df.show(5)" ], "metadata": {}, "outputs": [ @@ -80,7 +88,12 @@ { "cell_type": "code", "source": [ "\n", "#Some data cleanup steps to aggregate by visits and sums since Faker gives you some duplicate dates\n", "\n", "user_df = user_df.groupby('dates','product','behavior','visits').agg({\"amount\":'max'})\n", "user_df = user_df.groupby('dates','product','behavior','max(amount)').agg({\"visits\":'max'}).withColumnRenamed(\"max(visits)\",\"visits\").withColumnRenamed(\"max(amount)\",\"amount\" )\n", "\n" ], "metadata": {}, "outputs": [ @@ -99,7 +112,9 @@ { "cell_type": "code", "source": [ "#set condition for annotating customer behavior i.e. 
Regular (0) when amount spent > 150 else a non-regular (1)\n", "from pyspark.sql.functions import col, when \n", "user_df = user_df.withColumn(\"behavior\", when(col('amount') > 150 , 0).otherwise( 1))\n" ], "metadata": {}, "outputs": [ @@ -137,7 +152,8 @@ { "cell_type": "code", "source": [ "#Validate count of data , since we had 1000 records initially\n", "user_df.groupBy('behavior').count().show()" ], "metadata": {}, "outputs": [ @@ -156,7 +172,8 @@ { "cell_type": "code", "source": [ "#drop columns we don't need like the date column since we are not doing a time series analysis here only anomaly detection\n", "user_df = user_df.drop('dates')\n" ], "metadata": {}, "outputs": [ @@ -175,7 +192,9 @@ { "cell_type": "code", "source": [ "regular_records = user_df['behavior'] == 0\n", "irregular_records = user_df['behavior'] == 1\n", "# Classify the customer behavior into integers (0 and 1). Note we want to train the datapoint only with the regular values so any anomaly is detected when a deviation from the trained data is experienced.\n" ], "metadata": {}, "outputs": [ @@ -194,7 +213,9 @@ { "cell_type": "code", "source": [ "#lets split the datasets into regular and non regular customers\n", "user_df_regular = user_df[regular_records] # only regular values\n", "user_df_irregular = user_df[irregular_records] # only irregular data points\n" ], "metadata": {}, "outputs": [ @@ -213,7 +234,16 @@ { "cell_type": "code", "source": [ "#Drop the columns we don't need from these datasets\n", "user_df_regular = user_df_regular.drop('behavior','product')\n", "user_df_irregular = user_df_irregular.drop('behavior','product')\n", "# Ran into some issues loading the Spark dataframes into the keras tensors, was easier to just convert to pandas and let it do its thing\n", "user_df_regular = user_df_regular.select(\"*\").toPandas()\n", "user_df_irregular = user_df_irregular.select(\"*\").toPandas()\n", "#we want to feed only the values of the array into the neural net not column headers \n", 
"x_val_regular = user_df_regular.values\n", "x_val_irregular = user_df_irregular.values\n", "x_val_regular,x_val_irregular" ], "metadata": {}, "outputs": [ @@ -232,7 +262,10 @@ { "cell_type": "code", "source": [ "# split the regular customer data into test and training set\n", "from sklearn.model_selection import train_test_split\n", "x_regular_train, x_regular_test = train_test_split(\n", " x_val_regular, test_size=0.25, random_state=42)\n" ], "metadata": {}, "outputs": [ @@ -251,7 +284,8 @@ { "cell_type": "code", "source": [ "print(f\"Regular customers train count: {len(x_regular_train)} \\n\")\n", "print(f\"Regular customers test count: {len(x_regular_test)} \\n\") # going to be held out for test validation, note these add up to 750 i.e. validates our original counts" ], "metadata": {}, "outputs": [ @@ -270,7 +304,20 @@ { "cell_type": "code", "source": [ "\n", "from sklearn import metrics\n", "import numpy as np\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense\n", "#Define a sequential model and use a dense layer as input with a recitified linear unit activation function \n", "model = Sequential()\n", "model.add(Dense(10, input_dim=x_val_regular.shape[1], activation='relu'))\n", "model.add(Dense(3, activation='relu'))\n", "model.add(Dense(10, activation='relu'))\n", "\n", "model.add(Dense(x_val_regular.shape[1])) \n", "model.compile(loss='mean_squared_error', optimizer='adam') # Run standard mean square error\n", "model.summary()" ], "metadata": {}, "outputs": [ @@ -289,7 +336,14 @@ { "cell_type": "code", "source": [ "import tensorflow as tf\n", "\n", "from keras.callbacks import EarlyStopping\n", "#Implement early stopping to save on time\n", "save_early_callback = EarlyStopping(monitor='loss', min_delta=0,\n", " patience=3, verbose=1,\n", " restore_best_weights=True)\n", "model.fit(x_regular_train,x_regular_train,verbose=1,epochs=100, callbacks=[save_early_callback])" ], "metadata": {}, "outputs": [ @@ -308,7 
+362,20 @@ { "cell_type": "code", "source": [ "# Check prediction against the regular customer validation test data set\n", "prediction = model.predict(x_regular_test)\n", "cust_test_score = np.sqrt(metrics.mean_squared_error(prediction,x_regular_test))\n", "print(f\"Customer Test Score : {cust_test_score} \\n\")\n", "# Check prediction against the regular customer validation data set\n", "prediction = model.predict(x_val_regular)\n", "cust_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_regular))\n", "print(f\"Customer Validation Score : {cust_validation_score} \\n\")\n", "# Check prediction against the irregular customer validation data set\n", "prediction = model.predict(x_val_irregular)\n", "cust_irreg_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_irregular))\n", "# massive reconstruction error will manifest here run on the entire dataset\n", "print(f\"Irregular Customer Validation Score : {cust_irreg_validation_score} \\n\")\n", "\n" ], "metadata": {}, "outputs": [ -
vishwanath79 revised this gist
Jul 12, 2020 . 1 changed file with 334 additions and 1 deletion.There are no files selected for viewing
-
vishwanath79 revised this gist
Jul 12, 2020 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
-
vishwanath79 revised this gist
Jul 12, 2020 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
-
vishwanath79 revised this gist
Jul 12, 2020 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
-
vishwanath79 revised this gist
Jul 12, 2020 . 1 changed file with 0 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1 +0,0 @@ -
vishwanath79 created this gist
Jul 12, 2020 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1 @@ README This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1 @@ README