{
  "cells": [
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "from fastai import *          # Quick access to most common functionality\nfrom fastai.tabular import *  # Quick access to tabular functionality\nfrom fastai.docs import *     # Access to example data provided with fastai",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = get_adult()\ntrain_df, valid_df = df[:-2000].copy(),df[-2000:].copy()\ntrain_df.head()",
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 2,
          "data": {
            "text/plain": "   age          workclass  fnlwgt     education  education-num  \\\n0   49            Private  101320    Assoc-acdm           12.0   \n1   44            Private  236746       Masters           14.0   \n2   38            Private   96185       HS-grad            NaN   \n3   38       Self-emp-inc  112847   Prof-school           15.0   \n4   42   Self-emp-not-inc   82297       7th-8th            NaN   \n\n        marital-status        occupation    relationship                 race  \\\n0   Married-civ-spouse               NaN            Wife                White   \n1             Divorced   Exec-managerial   Not-in-family                White   \n2             Divorced               NaN       Unmarried                Black   \n3   Married-civ-spouse    Prof-specialty         Husband   Asian-Pac-Islander   \n4   Married-civ-spouse     Other-service            Wife                Black   \n\n       sex  capital-gain  capital-loss  hours-per-week  native-country  >=50k  \n0   Female             0          1902              40   United-States      1  \n1     Male         10520             0              45   United-States      1  \n2   Female             0             0              32   United-States      0  \n3     Male             0             0              40   United-States      1  \n4   Female             0             0              50   United-States      0  ",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>age</th>\n      <th>workclass</th>\n      <th>fnlwgt</th>\n      <th>education</th>\n      <th>education-num</th>\n      <th>marital-status</th>\n      <th>occupation</th>\n      <th>relationship</th>\n      <th>race</th>\n      <th>sex</th>\n      <th>capital-gain</th>\n      <th>capital-loss</th>\n      <th>hours-per-week</th>\n      <th>native-country</th>\n      <th>&gt;=50k</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>49</td>\n      <td>Private</td>\n      <td>101320</td>\n      <td>Assoc-acdm</td>\n      <td>12.0</td>\n      <td>Married-civ-spouse</td>\n      <td>NaN</td>\n      <td>Wife</td>\n      <td>White</td>\n      <td>Female</td>\n      <td>0</td>\n      <td>1902</td>\n      <td>40</td>\n      <td>United-States</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>44</td>\n      <td>Private</td>\n      <td>236746</td>\n      <td>Masters</td>\n      <td>14.0</td>\n      <td>Divorced</td>\n      <td>Exec-managerial</td>\n      <td>Not-in-family</td>\n      <td>White</td>\n      <td>Male</td>\n      <td>10520</td>\n      <td>0</td>\n      <td>45</td>\n      <td>United-States</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>38</td>\n      <td>Private</td>\n      <td>96185</td>\n      <td>HS-grad</td>\n      <td>NaN</td>\n      <td>Divorced</td>\n      <td>NaN</td>\n      <td>Unmarried</td>\n      <td>Black</td>\n      <td>Female</td>\n      <td>0</td>\n      <td>0</td>\n      <td>32</td>\n      <td>United-States</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>38</td>\n      <td>Self-emp-inc</td>\n      <td>112847</td>\n      <td>Prof-school</td>\n      <td>15.0</td>\n      <td>Married-civ-spouse</td>\n      <td>Prof-specialty</td>\n      <td>Husband</td>\n      <td>Asian-Pac-Islander</td>\n      <td>Male</td>\n      <td>0</td>\n      <td>0</td>\n      <td>40</td>\n      <td>United-States</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>42</td>\n      <td>Self-emp-not-inc</td>\n      <td>82297</td>\n      <td>7th-8th</td>\n      <td>NaN</td>\n      <td>Married-civ-spouse</td>\n      <td>Other-service</td>\n      <td>Wife</td>\n      <td>Black</td>\n      <td>Female</td>\n      <td>0</td>\n      <td>0</td>\n      <td>50</td>\n      <td>United-States</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "dep_var = '>=50k'\ncat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']\ndata = tabular_data_from_df(ADULT_PATH, train_df, valid_df, dep_var, tfms=[FillMissing, Categorify], cat_names=cat_names)",
      "execution_count": 3,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "from pandas_summary import DataFrameSummary\nfrom sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\nfrom sklearn import metrics",
      "execution_count": 4,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "data.train_ds.conts.shape, data.train_ds.cats.shape",
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 5,
          "data": {
            "text/plain": "(torch.Size([30561, 6]), torch.Size([30561, 9]))"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "cats,conts = data.train_ds.cats.numpy(), data.train_ds.conts.numpy()",
      "execution_count": 6,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df = np.concatenate((cats, conts), axis=1)",
      "execution_count": 7,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "df.shape",
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 8,
          "data": {
            "text/plain": "(30561, 15)"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "y = data.train_ds.y.numpy()",
      "execution_count": 9,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "m = RandomForestClassifier(n_jobs=-1)",
      "execution_count": 10,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "m.fit(df, y)\nm.score(df,y)",
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 11,
          "data": {
            "text/plain": "0.9880893949805307"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.4",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "gist": {
      "id": "",
      "data": {
        "description": "data preprocessing with Tabular Module fast.ai",
        "public": true
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}