
@vishwanath79
Last active March 30, 2021 02:47

Revisions

  1. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 81 additions and 14 deletions.
    95 changes: 81 additions & 14 deletions autoencoder_anomaly_detection.ipynb
    @@ -42,7 +42,14 @@
    {
    "cell_type": "code",
    "source": [
    "from pyspark.sql.functions import lit\n# Add some cosmetic colums to simulate synthetic data\nuser_df = user_df.withColumn(\n \"product\",\n lit(\"coffee\")\n ).withColumn(\n \"behavior\",lit(\"normal\")\n )\n"
    "from pyspark.sql.functions import lit\n",
    "# Add some cosmetic colums to simulate synthetic data\n",
    "user_df = user_df.withColumn(\n",
    " \"product\",\n",
    " lit(\"coffee\")\n",
    " ).withColumn(\n",
    " \"behavior\",lit(\"normal\")\n",
    " )\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -61,7 +68,8 @@
    {
    "cell_type": "code",
    "source": [
    "# Raw data\nuser_df.show(5)"
    "# Raw data\n",
    "user_df.show(5)"
    ],
    "metadata": {},
    "outputs": [
    @@ -80,7 +88,12 @@
    {
    "cell_type": "code",
    "source": [
    "\n#Some data cleanup steps to aggregate by visits and sums since Faker gives you some duplicate dates\n\nuser_df = user_df.groupby('dates','product','behavior','visits').agg({\"amount\":'max'})\nuser_df = user_df.groupby('dates','product','behavior','max(amount)').agg({\"visits\":'max'}).withColumnRenamed(\"max(visits)\",\"visits\").withColumnRenamed(\"max(amount)\",\"amount\" )\n\n"
    "\n",
    "#Some data cleanup steps to aggregate by visits and sums since Faker gives you some duplicate dates\n",
    "\n",
    "user_df = user_df.groupby('dates','product','behavior','visits').agg({\"amount\":'max'})\n",
    "user_df = user_df.groupby('dates','product','behavior','max(amount)').agg({\"visits\":'max'}).withColumnRenamed(\"max(visits)\",\"visits\").withColumnRenamed(\"max(amount)\",\"amount\" )\n",
    "\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -99,7 +112,9 @@
    {
    "cell_type": "code",
    "source": [
    "#set condition for annotating customer behavior i.e. Regular (0) when amount spent > 150 else a non-regular (1)\nfrom pyspark.sql.functions import col, when \nuser_df = user_df.withColumn(\"behavior\", when(col('amount') > 150 , 0).otherwise( 1))\n"
    "#set condition for annotating customer behavior i.e. Regular (0) when amount spent > 150 else a non-regular (1)\n",
    "from pyspark.sql.functions import col, when \n",
    "user_df = user_df.withColumn(\"behavior\", when(col('amount') > 150 , 0).otherwise( 1))\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -137,7 +152,8 @@
    {
    "cell_type": "code",
    "source": [
    "#Validate count of data , since we had 1000 records initially\nuser_df.groupBy('behavior').count().show()"
    "#Validate count of data , since we had 1000 records initially\n",
    "user_df.groupBy('behavior').count().show()"
    ],
    "metadata": {},
    "outputs": [
    @@ -156,7 +172,8 @@
    {
    "cell_type": "code",
    "source": [
    "#drop columns we dont need like the date column since we are not doing a time series analysis here only anamoly detection\nuser_df = user_df.drop('dates')\n"
    "#drop columns we dont need like the date column since we are not doing a time series analysis here only anamoly detection\n",
    "user_df = user_df.drop('dates')\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -175,7 +192,9 @@
    {
    "cell_type": "code",
    "source": [
    "good_records = user_df['behavior'] == 0\nbad_records = user_df['behavior'] == 1\n# Classify the customer behavior into integers (0 and 1). Note we want to train the datapoint only with the good values so any anamoly is detected when a devation from the trained data is experienced.\n"
    "regular_records = user_df['behavior'] == 0\n",
    "irregular_records = user_df['behavior'] == 1\n",
    "# Classify the customer behavior into integers (0 and 1). Note we want to train the datapoint only with the regular values so any anamoly is detected when a devation from the trained data is experienced.\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -194,7 +213,9 @@
    {
    "cell_type": "code",
    "source": [
    "#lets split the datasets into regular and non regular customers\nuser_df_good = user_df[good_records] # only good values\nuser_df_bad = user_df[bad_records] # only bad data points\n"
    "#lets split the datasets into regular and non regular customers\n",
    "user_df_regular = user_df[regular_records] # only regular values\n",
    "user_df_irregular = user_df[irregular_records] # only irregular data points\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -213,7 +234,16 @@
    {
    "cell_type": "code",
    "source": [
    "#Drop the columsn we dont need from these datasets\nuser_df_good = user_df_good.drop('behavior','product')\nuser_df_bad = user_df_bad.drop('behavior','product')\n# Ran into some issues loading the Spark dataframes into the keras tensors, was easier to just convert to pandas and let it do its thing\nuser_df_good = user_df_good.select(\"*\").toPandas()\nuser_df_bad = user_df_bad.select(\"*\").toPandas()\n#we want to feed only the values of the array into the neural net not column headers \nx_val_good = user_df_good.values\nx_val_bad = user_df_bad.values\nx_val_good,x_val_bad"
    "#Drop the columsn we dont need from these datasets\n",
    "user_df_regular = user_df_regular.drop('behavior','product')\n",
    "user_df_irregular = user_df_irregular.drop('behavior','product')\n",
    "# Ran into some issues loading the Spark dataframes into the keras tensors, was easier to just convert to pandas and let it do its thing\n",
    "user_df_regular = user_df_regular.select(\"*\").toPandas()\n",
    "user_df_irregular = user_df_irregular.select(\"*\").toPandas()\n",
    "#we want to feed only the values of the array into the neural net not column headers \n",
    "x_val_regular = user_df_regular.values\n",
    "x_val_irregular = user_df_irregular.values\n",
    "x_val_regular,x_val_irregular"
    ],
    "metadata": {},
    "outputs": [
    @@ -232,7 +262,10 @@
    {
    "cell_type": "code",
    "source": [
    "# split the regular customer data into test and training set\nfrom sklearn.model_selection import train_test_split\nx_good_train, x_good_test = train_test_split(\n x_val_good, test_size=0.25, random_state=42)\n"
    "# split the regular customer data into test and training set\n",
    "from sklearn.model_selection import train_test_split\n",
    "x_regular_train, x_regular_test = train_test_split(\n",
    " x_val_regular, test_size=0.25, random_state=42)\n"
    ],
    "metadata": {},
    "outputs": [
    @@ -251,7 +284,8 @@
    {
    "cell_type": "code",
    "source": [
    "print(f\"Regular customers train count: {len(x_good_train)} \\n\")\nprint(f\"Regular customers test count: {len(x_good_test)} \\n\") # going to be held out for test validation, note these add up to 750 i.e. validates our original counts"
    "print(f\"Regular customers train count: {len(x_regular_train)} \\n\")\n",
    "print(f\"Regular customers test count: {len(x_regular_test)} \\n\") # going to be held out for test validation, note these add up to 750 i.e. validates our original counts"
    ],
    "metadata": {},
    "outputs": [
    @@ -270,7 +304,20 @@
    {
    "cell_type": "code",
    "source": [
    "\nfrom sklearn import metrics\nimport numpy as np\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense\n#Define a sequential model and use a dense layer as input with a recitified linear unit activation function \nmodel = Sequential()\nmodel.add(Dense(10, input_dim=x_val_good.shape[1], activation='relu'))\nmodel.add(Dense(3, activation='relu'))\nmodel.add(Dense(10, activation='relu'))\n\nmodel.add(Dense(x_val_good.shape[1])) \nmodel.compile(loss='mean_squared_error', optimizer='adam') # Run standard mean square error\nmodel.summary()"
    "\n",
    "from sklearn import metrics\n",
    "import numpy as np\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense\n",
    "#Define a sequential model and use a dense layer as input with a recitified linear unit activation function \n",
    "model = Sequential()\n",
    "model.add(Dense(10, input_dim=x_val_regular.shape[1], activation='relu'))\n",
    "model.add(Dense(3, activation='relu'))\n",
    "model.add(Dense(10, activation='relu'))\n",
    "\n",
    "model.add(Dense(x_val_regular.shape[1])) \n",
    "model.compile(loss='mean_squared_error', optimizer='adam') # Run standard mean square error\n",
    "model.summary()"
    ],
    "metadata": {},
    "outputs": [
    @@ -289,7 +336,14 @@
    {
    "cell_type": "code",
    "source": [
    "import tensorflow as tf\n\nfrom keras.callbacks import EarlyStopping\n#Implement early stopping to save on time\nsave_early_callback = EarlyStopping(monitor='loss', min_delta=0,\n patience=3, verbose=1,\n restore_best_weights=True)\nmodel.fit(x_good_train,x_good_train,verbose=1,epochs=100, callbacks=[save_early_callback])"
    "import tensorflow as tf\n",
    "\n",
    "from keras.callbacks import EarlyStopping\n",
    "#Implement early stopping to save on time\n",
    "save_early_callback = EarlyStopping(monitor='loss', min_delta=0,\n",
    " patience=3, verbose=1,\n",
    " restore_best_weights=True)\n",
    "model.fit(x_regular_train,x_regular_train,verbose=1,epochs=100, callbacks=[save_early_callback])"
    ],
    "metadata": {},
    "outputs": [
    @@ -308,7 +362,20 @@
    {
    "cell_type": "code",
    "source": [
    "# Check prediction against the regular customer validation test data set\nprediction = model.predict(x_good_test)\ncust_test_score = np.sqrt(metrics.mean_squared_error(prediction,x_good_test))\nprint(f\"Customer Test Score : {cust_test_score} \\n\")\n# Check prediction against the regular customer validation data set\nprediction = model.predict(x_val_good)\ncust_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_good))\nprint(f\"Customer Validation Score : {cust_validation_score} \\n\")\n# Check prediction against the irregular customer validation data set\nprediction = model.predict(x_val_bad)\ncust_irreg_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_bad))\n# massive reconstruction error will manifest here run on the entire dataset\nprint(f\"Irregular Customer Validation Score : {cust_irreg_validation_score} \\n\")\n\n"
    "# Check prediction against the regular customer validation test data set\n",
    "prediction = model.predict(x_regular_test)\n",
    "cust_test_score = np.sqrt(metrics.mean_squared_error(prediction,x_regular_test))\n",
    "print(f\"Customer Test Score : {cust_test_score} \\n\")\n",
    "# Check prediction against the regular customer validation data set\n",
    "prediction = model.predict(x_val_regular)\n",
    "cust_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_regular))\n",
    "print(f\"Customer Validation Score : {cust_validation_score} \\n\")\n",
    "# Check prediction against the irregular customer validation data set\n",
    "prediction = model.predict(x_val_irregular)\n",
    "cust_irreg_validation_score = np.sqrt(metrics.mean_squared_error(prediction,x_val_irregular))\n",
    "# massive reconstruction error will manifest here run on the entire dataset\n",
    "print(f\"Irregular Customer Validation Score : {cust_irreg_validation_score} \\n\")\n",
    "\n"
    ],
    "metadata": {},
    "outputs": [
  2. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 334 additions and 1 deletion.
    335 changes: 334 additions & 1 deletion autoencoder_anomaly_detection.ipynb
    334 additions, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  3. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion autoencoder_anomaly_detection.ipynb
    1 addition, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  4. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion autoencoder_anomaly_detection.ipynb
    1 addition, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  5. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion autoencoder_anomaly_detection.ipynb
    1 addition, 1 deletion not shown because the diff is too large. Please use a local Git client to view these changes.
  6. vishwanath79 revised this gist Jul 12, 2020. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion gistfile1.txt
    @@ -1 +0,0 @@
    README
  7. vishwanath79 created this gist Jul 12, 2020.
    1 change: 1 addition & 0 deletions autoencoder_anomaly_detection.ipynb
    @@ -0,0 +1 @@
    README
    1 change: 1 addition & 0 deletions gistfile1.txt
    @@ -0,0 +1 @@
    README