
@vishwanath79
Last active March 30, 2021 02:47

Revisions

  1. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 81 additions and 14 deletions.
    95 changes: 81 additions & 14 deletions autoencoder_anomaly_detection.ipynb
    @@ -42,7 +42,14 @@
    {
    "cell_type": "code",
    "source": [
    "from pyspark.sql.functions import lit\n# Add some cosmetic colums to simulate synthetic data\nuser_df = user_df.withColumn(\n \"product\",\n lit(\"coffee\")\n ).withColumn(\n \"behavior\",lit(\"normal\")\n )\n"
    "from pyspark.sql.functions import lit\n",
    "# Add some cosmetic colums to simulate synthetic data\n",
    "user_df = user_df.withColumn(\n",
    " \"product\",\n",
    " lit(\"coffee\")\n",
    " ).withColumn(\n",
    " \"behavior\",lit(\"normal\")\n",
    " )\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -61,7 +68,8 @@
    {
    "cell_type": "code",
    "source": [
    "# Raw data\nuser_df.show(5)"
    "# Raw data\n",
    "user_df.show(5)"
    ],
    "metadata": {},
    "outputs": [
    @@ -80,7 +88,12 @@
    {
    "cell_type": "code",
    "source": [
    "\n#Some data cleanup steps to aggregate by visits and sums since Faker gives you some duplicate dates\n\nuser_df = user_df.groupby('dates','product','behavior','visits').agg({\"amount\":'max'})\nuser_df = user_df.groupby('dates','product','behavior','max(amount)').agg({\"visits\":'max'}).withColumnRenamed(\"max(visits)\",\"visits\").withColumnRenamed(\"max(amount)\",\"amount\" )\n\n"
    "\n",
    "#Some data cleanup steps to aggregate by visits and sums since Faker gives you some duplicate dates\n",
    "\n",
    "user_df = user_df.groupby('dates','product','behavior','visits').agg({\"amount\":'max'})\n",
    "user_df = user_df.groupby('dates','product','behavior','max(amount)').agg({\"visits\":'max'}).withColumnRenamed(\"max(visits)\",\"visits\").withColumnRenamed(\"max(amount)\",\"amount\" )\n",
    "\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -99,7 +112,9 @@
    {
    "cell_type": "code",
    "source": [
    "#set condition for annotating customer behavior i.e. Regular (0) when amount spent > 150 else a non-regular (1)\nfrom pyspark.sql.functions import col, when \nuser_df = user_df.withColumn(\"behavior\", when(col('amount') > 150 , 0).otherwise( 1))\n"
    "#set condition for annotating customer behavior i.e. Regular (0) when amount spent > 150 else a non-regular (1)\n",
    "from pyspark.sql.functions import col, when \n",
    "user_df = user_df.withColumn(\"behavior\", when(col('amount') > 150 , 0).otherwise( 1))\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -137,7 +152,8 @@
    {
    "cell_type": "code",
    "source": [
    "#Validate count of data , since we had 1000 records initially\nuser_df.groupBy('behavior').count().show()"
    "#Validate count of data , since we had 1000 records initially\n",
    "user_df.groupBy('behavior').count().show()"
    ],
    "metadata": {},
    "outputs": [
    @@ -156,7 +172,8 @@
    {
    "cell_type": "code",
    "source": [
    "#drop columns we dont need like the date column since we are not doing a time series analysis here only anamoly detection\nuser_df = user_df.drop('dates')\n"
    "#drop columns we dont need like the date column since we are not doing a time series analysis here only anamoly detection\n",
    "user_df = user_df.drop('dates')\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -175,7 +192,9 @@
    {
    "cell_type": "code",
    "source": [
    "good_records = user_df['behavior'] == 0\nbad_records = user_df['behavior'] == 1\n# Classify the customer behavior into integers (0 and 1). Note we want to train the datapoint only with the good values so any anamoly is detected when a devation from the trained data is experienced.\n"
    "regular_records = user_df['behavior'] == 0\n",
    "irregular_records = user_df['behavior'] == 1\n",
    "# Classify the customer behavior into integers (0 and 1). Note we want to train the datapoint only with the regular values so any anamoly is detected when a devation from the trained data is experienced.\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -194,7 +213,9 @@
    {
    "cell_type": "code",
    "source": [
    "#lets split the datasets into regular and non regular customers\nuser_df_good = user_df[good_records] # only good values\nuser_df_bad = user_df[bad_records] # only bad data points\n"
    "#lets split the datasets into regular and non regular customers\n",
    "user_df_regular = user_df[regular_records] # only regular values\n",
    "user_df_irregular = user_df[irregular_records] # only irregular data points\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -213,7 +234,16 @@
    {
    "cell_type": "code",
    "source": [
    "#Drop the columsn we dont need from these datasets\nuser_df_good = user_df_good.drop('behavior','product')\nuser_df_bad = user_df_bad.drop('behavior','product')\n# Ran into some issues loading the Spark dataframes into the keras tensors, was easier to just convert to pandas and let it do its thing\nuser_df_good = user_df_good.select(\"*\").toPandas()\nuser_df_bad = user_df_bad.select(\"*\").toPandas()\n#we want to feed only the values of the array into the neural net not column headers \nx_val_good = user_df_good.values\nx_val_bad = user_df_bad.values\nx_val_good,x_val_bad"
    "#Drop the columsn we dont need from these datasets\n",
    "user_df_regular = user_df_regular.drop('behavior','product')\n",
    "user_df_irregular = user_df_irregular.drop('behavior','product')\n",
    "# Ran into some issues loading the Spark dataframes into the keras tensors, was easier to just convert to pandas and let it do its thing\n",
    "user_df_regular = user_df_regular.select(\"*\").toPandas()\n",
    "user_df_irregular = user_df_irregular.select(\"*\").toPandas()\n",
    "#we want to feed only the values of the array into the neural net not column headers \n",
    "x_val_regular = user_df_regular.values\n",
    "x_val_irregular = user_df_irregular.values\n",
    "x_val_regular,x_val_irregular"
    ],
    "metadata": {},
    "outputs": [
    @@ -232,7 +262,10 @@
    {
    "cell_type": "code",
    "source": [
    "# split the regular customer data into test and training set\nfrom sklearn.model_selection import train_test_split\nx_good_train, x_good_test = train_test_split(\n x_val_good, test_size=0.25, random_state=42)\n"
    "# split the regular customer data into test and training set\n",
    "from sklearn.model_selection import train_test_split\n",
    "x_regular_train, x_regular_test = train_test_split(\n",
    " x_val_regular, test_size=0.25, random_state=42)\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -251,7 +284,8 @@
    {
    "cell_type": "code",
    "source": [
    "print(f\"Regular customers train count: {len(x_good_train)} \\n\")\nprint(f\"Regular customers test count: {len(x_good_test)} \\n\") # going to be held out for test validation, note these add up to 750 i.e. validates our original counts"
    "print(f\"Regular customers train count: {len(x_regular_train)} \\n\")\n",
    "print(f\"Regular customers test count: {len(x_regular_test)} \\n\") # going to be held out for test validation, note these add up to 750 i.e. validates our original counts"
    ],
    "metadata": {},
    "outputs": [
    @@ -270,7 +304,20 @@
    {
    "cell_type": "code",
    "source": [
    "\nfrom sklearn import metrics\nimport numpy as np\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense\n#Define a sequential model and use a dense layer as input with a recitified linear unit activation function \nmodel = Sequential()\nmodel.add(Dense(10, input_dim=x_val_good.shape[1], activation='relu'))\nmodel.add(Dense(3, activation='relu'))\nmodel.add(Dense(10, activation='relu'))\n\nmodel.add(Dense(x_val_good.shape[1])) \nmodel.compile(loss='mean_squared_error', optimizer='adam') # Run standard mean square error\nmodel.summary()"
    "\n",
    "from sklearn import metrics\n",
    "import numpy as np\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense\n",
    "#Define a sequential model and use a dense layer as input with a recitified linear unit activation function \n",
    "model = Sequential()\n",
    "model.add(Dense(10, input_dim=x_val_regular.shape[1], activation='relu'))\n",
    "model.add(Dense(3, activation='relu'))\n",
    "model.add(Dense(10, activation='relu'))\n",
    "\n",
    "model.add(Dense(x_val_regular.shape[1])) \n",
    "model.compile(loss='mean_squared_error', optimizer='adam') # Run standard mean square error\n",
    "model.summary()"
    ],
    "metadata": {},
    "outputs": [
    @@ -289,7 +336,14 @@
    {
    "cell_type": "code",
    "source": [
    "import tensorflow as tf\n\nfrom keras.callbacks import EarlyStopping\n#Implement early stopping to save on time\nsave_early_callback = EarlyStopping(monitor='loss', min_delta=0,\n patience=3, verbose=1,\n restore_best_weights=True)\nmodel.fit(x_good_train,x_good_train,verbose=1,epochs=100, callbacks=[save_early_callback])"
    "import tensorflow as tf\n",
    "\n",
    "from keras.callbacks import EarlyStopping\n",
    "#Implement early stopping to save on time\n",
    "save_early_callback = EarlyStopping(monitor='loss', min_delta=0,\n",
    " patience=3, verbose=1,\n",
    " restore_best_weights=True)\n",
    "model.fit(x_regular_train,x_regular_train,verbose=1,epochs=100, callbacks=[save_early_callback])"
    ],
    "metadata": {},
    "outputs": [
    @@ -308,7 +362,20 @@
    {
    "cell_type": "code",
    "source": [
    "# Check prediction against the regular customer validation test data set\nprediction = model.predict(x_good_test)\ncust_test_score = np.sqrt(metrics.mean_squared_error(prediction,x_good_test))\nprint(f\"Customer Test Score : {cust_test_score} \\n\")\n# Check prediction against the regular customer validation data set\nprediction = model.predict(x_val_good)\ncust_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_good))\nprint(f\"Customer Validation Score : {cust_validation_score} \\n\")\n# Check prediction against the irregular customer validation data set\nprediction = model.predict(x_val_bad)\ncust_irreg_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_bad))\n# massive reconstruction error will manifest here run on the entire dataset\nprint(f\"Irregular Customer Validation Score : {cust_irreg_validation_score} \\n\")\n\n"
    "# Check prediction against the regular customer validation test data set\n",
    "prediction = model.predict(x_regular_test)\n",
    "cust_test_score = np.sqrt(metrics.mean_squared_error(prediction,x_regular_test))\n",
    "print(f\"Customer Test Score : {cust_test_score} \\n\")\n",
    "# Check prediction against the regular customer validation data set\n",
    "prediction = model.predict(x_val_regular)\n",
    "cust_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_regular))\n",
    "print(f\"Customer Validation Score : {cust_validation_score} \\n\")\n",
    "# Check prediction against the irregular customer validation data set\n",
    "prediction = model.predict(x_val_irregular)\n",
    "cust_irreg_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_irregular))\n",
    "# massive reconstruction error will manifest here run on the entire dataset\n",
    "print(f\"Irregular Customer Validation Score : {cust_irreg_validation_score} \\n\")\n",
    "\n"
    ],
    "metadata": {},
    "outputs": [
  2. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 334 additions and 1 deletion.
    335 changes: 334 additions & 1 deletion autoencoder_anomaly_detection.ipynb
    334 additions, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  3. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion autoencoder_anomaly_detection.ipynb
    1 addition, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  4. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion autoencoder_anomaly_detection.ipynb
    1 addition, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  5. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion autoencoder_anomaly_detection.ipynb
    1 addition, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  6. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion gistfile1.txt
    @@ -1 +0,0 @@
    README
  7. vishwanath79 created this gist Jul 12, 2020.
    1 change: 1 addition & 0 deletions autoencoder_anomaly_detection.ipynb
    @@ -0,0 +1 @@
    README
    1 change: 1 addition & 0 deletions gistfile1.txt
    @@ -0,0 +1 @@
    README