{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Data preprocessing with the fast.ai tabular module\n",
    "\n",
    "Load the Adult sample that ships with fastai, run fastai's tabular transforms (`FillMissing`, `Categorify`) over it, and fit a scikit-learn random forest on the resulting arrays to predict the `>=50k` column.\n",
    "\n",
    "Stored outputs come from the original run. The last two code cells were changed afterwards and have not been re-run, so they carry no output."
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "# The fastai star imports are kept as-is: later cells use names that are never\n",
    "# imported explicitly (np, get_adult, ADULT_PATH, tabular_data_from_df, FillMissing,\n",
    "# Categorify), so they must be coming from these modules.\n",
    "from fastai import * # Quick access to most common functionality\n",
    "from fastai.tabular import * # Quick access to tabular functionality\n",
    "from fastai.docs import * # Access to example data provided with fastai\n",
    "\n",
    "from pandas_summary import DataFrameSummary\n",
    "from sklearn import metrics\n",
    "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
    "\n",
    "# Configuration: the values a reader is most likely to tune.\n",
    "N_VALID = 2000  # trailing rows of the dataset held out for validation (must be > 0)\n",
    "SEED = 42       # random_state for the forest, so repeated runs give the same scores"
   ],
   "execution_count": 1,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Load data\n",
    "\n",
    "The last `N_VALID` rows are held out as the validation set; everything before them is used for training."
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "adult_df = get_adult()\n",
    "# .copy() gives each split its own frame instead of a slice of adult_df.\n",
    "train_df, valid_df = adult_df[:-N_VALID].copy(), adult_df[-N_VALID:].copy()\n",
    "train_df.head()"
   ],
   "execution_count": 2,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 2,
     "data": {
      "text/plain": " age workclass fnlwgt education education-num \\\n0 49 Private 101320 Assoc-acdm 12.0 \n1 44 Private 236746 Masters 14.0 \n2 38 Private 96185 HS-grad NaN \n3 38 Self-emp-inc 112847 Prof-school 15.0 \n4 42 Self-emp-not-inc 82297 7th-8th NaN \n\n marital-status occupation relationship race \\\n0 Married-civ-spouse NaN Wife White \n1 Divorced Exec-managerial Not-in-family White \n2 Divorced NaN Unmarried Black \n3 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander \n4 Married-civ-spouse Other-service Wife Black \n\n sex capital-gain capital-loss hours-per-week native-country >=50k \n0 Female 0 1902 40 United-States 1 \n1 Male 10520 0 45 United-States 1 \n2 Female 0 0 32 United-States 0 \n3 Male 0 0 40 United-States 1 \n4 Female 0 0 50 United-States 0 ",
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\"><th></th><th>age</th><th>workclass</th><th>fnlwgt</th><th>education</th><th>education-num</th><th>marital-status</th><th>occupation</th><th>relationship</th><th>race</th><th>sex</th><th>capital-gain</th><th>capital-loss</th><th>hours-per-week</th><th>native-country</th><th>&gt;=50k</th></tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr><th>0</th><td>49</td><td>Private</td><td>101320</td><td>Assoc-acdm</td><td>12.0</td><td>Married-civ-spouse</td><td>NaN</td><td>Wife</td><td>White</td><td>Female</td><td>0</td><td>1902</td><td>40</td><td>United-States</td><td>1</td></tr>\n",
       "    <tr><th>1</th><td>44</td><td>Private</td><td>236746</td><td>Masters</td><td>14.0</td><td>Divorced</td><td>Exec-managerial</td><td>Not-in-family</td><td>White</td><td>Male</td><td>10520</td><td>0</td><td>45</td><td>United-States</td><td>1</td></tr>\n",
       "    <tr><th>2</th><td>38</td><td>Private</td><td>96185</td><td>HS-grad</td><td>NaN</td><td>Divorced</td><td>NaN</td><td>Unmarried</td><td>Black</td><td>Female</td><td>0</td><td>0</td><td>32</td><td>United-States</td><td>0</td></tr>\n",
       "    <tr><th>3</th><td>38</td><td>Self-emp-inc</td><td>112847</td><td>Prof-school</td><td>15.0</td><td>Married-civ-spouse</td><td>Prof-specialty</td><td>Husband</td><td>Asian-Pac-Islander</td><td>Male</td><td>0</td><td>0</td><td>40</td><td>United-States</td><td>1</td></tr>\n",
       "    <tr><th>4</th><td>42</td><td>Self-emp-not-inc</td><td>82297</td><td>7th-8th</td><td>NaN</td><td>Married-civ-spouse</td><td>Other-service</td><td>Wife</td><td>Black</td><td>Female</td><td>0</td><td>0</td><td>50</td><td>United-States</td><td>0</td></tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Preprocess with fastai's tabular transforms\n",
    "\n",
    "`cat_names` lists the columns to treat as categorical; the remaining columns (apart from the dependent variable) are left for fastai to handle as continuous."
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "dep_var = '>=50k'\n",
    "cat_names = ['workclass', 'education', 'marital-status', 'occupation',\n",
    "             'relationship', 'race', 'sex', 'native-country']\n",
    "data = tabular_data_from_df(ADULT_PATH, train_df, valid_df, dep_var,\n",
    "                            tfms=[FillMissing, Categorify], cat_names=cat_names)"
   ],
   "execution_count": 3,
   "outputs": []
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "data.train_ds.conts.shape, data.train_ds.cats.shape"
   ],
   "execution_count": 5,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 5,
     "data": {
      "text/plain": "(torch.Size([30561, 6]), torch.Size([30561, 9]))"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "**Note:** there are 9 categorical columns although `cat_names` lists only 8. The extra one is presumably a missing-value indicator added by `FillMissing` (`education-num` contains `NaN`) - **TODO** confirm by inspecting the dataset's categorical column names."
   ]
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Build the feature matrix\n",
    "\n",
    "scikit-learn wants plain arrays, so the categorical and continuous tensors are converted to NumPy and placed side by side (categorical columns first)."
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "train_cats, train_conts = data.train_ds.cats.numpy(), data.train_ds.conts.numpy()"
   ],
   "execution_count": 6,
   "outputs": []
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "# Separate name on purpose: the raw DataFrame (adult_df) is no longer overwritten\n",
    "# by the NumPy matrix, so every cell above can be re-run safely.\n",
    "X_train = np.concatenate((train_cats, train_conts), axis=1)"
   ],
   "execution_count": 7,
   "outputs": []
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "X_train.shape"
   ],
   "execution_count": 8,
   "outputs": [
    {
     "output_type": "execute_result",
     "execution_count": 8,
     "data": {
      "text/plain": "(30561, 15)"
     },
     "metadata": {}
    }
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "y_train = data.train_ds.y.numpy()"
   ],
   "execution_count": 9,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Random forest"
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "# n_jobs=-1 uses every available core; random_state pins the forest's randomness.\n",
    "m = RandomForestClassifier(n_jobs=-1, random_state=SEED)"
   ],
   "execution_count": 10,
   "outputs": []
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "m.fit(X_train, y_train)\n",
    "m.score(X_train, y_train)  # mean accuracy on the rows the forest was fitted on"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "The original (unseeded) run of the cell above reported a training accuracy of `0.9880893949805307`. That figure is measured on the same rows the forest was fitted on, so it says little about how the model generalises. The held-out score below is the one to read."
   ]
  },
  {
   "metadata": { "trusted": true },
   "cell_type": "code",
   "source": [
    "# Assumes data.valid_ds exposes the same cats / conts / y tensors as data.train_ds\n",
    "# (only train_ds is exercised above) - TODO confirm on the installed fastai version.\n",
    "valid_ds = data.valid_ds\n",
    "X_valid = np.concatenate((valid_ds.cats.numpy(), valid_ds.conts.numpy()), axis=1)\n",
    "y_valid = valid_ds.y.numpy()\n",
    "\n",
    "# The forest can only score rows laid out exactly like the ones it was fitted on.\n",
    "assert X_valid.shape[1] == X_train.shape[1], \"train/valid feature columns differ\"\n",
    "assert len(X_valid) == len(y_valid), \"validation features and labels are misaligned\"\n",
    "\n",
    "m.score(X_valid, y_valid)"
   ],
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  },
  "language_info": {
   "name": "python",
   "version": "3.6.4",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "gist": {
   "id": "",
   "data": {
    "description": "data preprocessing with Tabular Module fast.ai",
    "public": true
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}