bsweger · November 21, 2017 22:22 · Aug 1, 2014 · Jul 31, 2014 · Jul 25, 2014 · Jul 24, 2014
diff --git a/Taxes Paid and Personal Income.ipynb b/Taxes Paid and Personal Income.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:eb74777a052264aae3749a175943a15d17dd907b18bc57761b73c885d9a67cc1"
+  "signature": "sha256:8150453367b906e07eef200351901fb9c424c6eeeb84438199e61c4a49c6d449"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -33,21 +33,21 @@
      "collapsed": false,
      "input": [
       "import numpy as np\n",
-      "import pandas as pd"
+      "import pandas as pd\n",
+      "import glob"
      ],
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 36
+     "prompt_number": 28
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "gross = pd.DataFrame()\n",
-      "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
-      "for year in years:\n",
-      "    file = 'gross/%sIRSGross.csv' % year\n",
+      "grossfiles = glob.glob('gross/gross????.csv')\n",
+      "for file in grossfiles:\n",
       "    df = pd.read_csv(\n",
       "        file, \n",
       "        header=0,\n",
@@ -81,7 +81,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 37
+     "prompt_number": 29
     },
     {
      "cell_type": "heading",
@@ -109,9 +109,8 @@
      "collapsed": false,
      "input": [
       "refunds = pd.DataFrame()\n",
-      "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
-      "for year in years:\n",
-      "    file = 'refunds/Refunds%s.csv' % year\n",
+      "refundfiles = glob.glob('refunds/refunds????.csv')\n",
+      "for file in refundfiles:\n",
       "    df = pd.read_csv(\n",
       "        file, \n",
       "        header=0,\n",
@@ -140,7 +139,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 38
+     "prompt_number": 30
     },
     {
      "cell_type": "heading",
@@ -166,7 +165,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 40
+     "prompt_number": 31
     },
     {
      "cell_type": "code",
@@ -177,7 +176,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 42
+     "prompt_number": 32
     },
     {
      "cell_type": "code",
@@ -188,7 +187,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 43
+     "prompt_number": 33
     }
    ],
    "metadata": {}

diff --git a/Taxes Paid and Personal Income.ipynb b/Taxes Paid and Personal Income.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:08455faf1ffcde2b6c482f4b8542781aea6c495ea5e0160f21db3b139958ddbd"
+  "signature": "sha256:eb74777a052264aae3749a175943a15d17dd907b18bc57761b73c885d9a67cc1"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -142,17 +142,6 @@
      "outputs": [],
      "prompt_number": 38
     },
-    {
-     "cell_type": "code",
-     "collapsed": false,
-     "input": [
-      "gross.to_csv('grossall.csv',index=False)"
-     ],
-     "language": "python",
-     "metadata": {},
-     "outputs": [],
-     "prompt_number": 39
-    },
     {
      "cell_type": "heading",
      "level": 2,

diff --git a/Taxes Paid and Personal Income.ipynb b/Taxes Paid and Personal Income.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:0bcd9f4735d0a57d6a01f8e156f3cb70f2f0ee17e26e59360a69ca1ddbe8c068"
+  "signature": "sha256:08455faf1ffcde2b6c482f4b8542781aea6c495ea5e0160f21db3b139958ddbd"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -38,7 +38,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 294
+     "prompt_number": 36
     },
     {
      "cell_type": "code",
@@ -70,7 +70,7 @@
       "#hacky stuff to get rid of non-numbers\n",
       "for col in gross.columns.values:\n",
       "    if col <> 'state':\n",
-      "        gross[col] = gross[col].replace('[^0-9]+', '', regex=True) \n",
+      "        gross[col] = gross[col].replace('[^0-9.]+', '', regex=True) \n",
       "\n",
       "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
       "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
@@ -81,7 +81,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 295
+     "prompt_number": 37
     },
     {
      "cell_type": "heading",
@@ -130,7 +130,7 @@
       "#hacky stuff to get rid of non-numbers\n",
       "for col in refunds.columns.values:\n",
       "    if col <> 'state':\n",
-      "        refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True)  \n",
+      "        refunds[col] = refunds[col].replace('[^0-9.]+', '', regex=True)  \n",
       "\n",
       "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
       "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
@@ -140,7 +140,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 296
+     "prompt_number": 38
     },
     {
      "cell_type": "code",
@@ -151,7 +151,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 297
+     "prompt_number": 39
     },
     {
      "cell_type": "heading",
@@ -177,7 +177,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 298
+     "prompt_number": 40
     },
     {
      "cell_type": "code",
@@ -188,7 +188,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 299
+     "prompt_number": 42
     },
     {
      "cell_type": "code",
@@ -199,7 +199,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 300
+     "prompt_number": 43
     }
    ],
    "metadata": {}

diff --git a/Taxes Paid and Personal Income.ipynb b/Taxes Paid and Personal Income.ipynb
@@ -0,0 +1,208 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:0bcd9f4735d0a57d6a01f8e156f3cb70f2f0ee17e26e59360a69ca1ddbe8c068"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Gross Tax Collections by State"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Gross-Collections,-by-Type-of-Tax-and-State,-Fiscal-Year-IRS-Data-Book-Table-5\n",
+      "\n",
+      "**Notes:**\n",
+      "\n",
+      "* Estate and trust income tax is tax on the income (or loss) of an estate or a trust. Prior to FY 2008, this amount was reported as part of the individual income tax total\n",
+      "* Estate tax is the tax on a property transfer at death"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import numpy as np\n",
+      "import pandas as pd"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 294
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "gross = pd.DataFrame()\n",
+      "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
+      "for year in years:\n",
+      "    file = 'gross/%sIRSGross.csv' % year\n",
+      "    df = pd.read_csv(\n",
+      "        file, \n",
+      "        header=0,\n",
+      "        thousands=',',\n",
+      "        names=['state', 'total','business','individual_total', \n",
+      "               'individual_witheld_fica', 'individual_payments_seca', \n",
+      "               'ui', 'rr', 'estate_trust', 'estate', 'gift', 'excise']\n",
+      "    )\n",
+      "    df.dropna(subset=['state'],inplace=True)\n",
+      "    df['year'] = year\n",
+      "    gross = pd.concat([gross, df])\n",
+      "\n",
+      "#hacky stuff to fix up state names\n",
+      "gross['state'] = gross['state'].str.strip()\n",
+      "gross['state'] = gross['state'].str.replace('United States, total \\[4\\]', 'United States')\n",
+      "gross['state'] = gross['state'].str.replace('US Totals', 'United States')\n",
+      "gross['state'] = gross['state'].str.replace('US Total', 'United States')\n",
+      "#gross.replace('*','')\n",
+      "\n",
+      "#hacky stuff to get rid of non-numbers\n",
+      "for col in gross.columns.values:\n",
+      "    if col <> 'state':\n",
+      "        gross[col] = gross[col].replace('[^0-9]+', '', regex=True) \n",
+      "\n",
+      "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
+      "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
+      "gross['ui'] = gross['ui'].astype(str).convert_objects(convert_numeric=True)\n",
+      "gross['estate'] = gross['estate'].astype(str).convert_objects(convert_numeric=True)\n",
+      "gross['gift'] = gross['gift'].astype(str).convert_objects(convert_numeric=True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 295
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Tax Refunds by State"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Amount-of-Refunds-Issued,-Including-Interest,-by-State-and-Fiscal-Year-IRS-Data-Book-Table-8\n",
+      "\n",
+      "**Notes**\n",
+      "\n",
+      "* total tax refunds include overpayment refunds, refunds resulting from examination activity, refundable tax credits, and other refunds required by law. Also includes interest.\n",
+      "* employment refunds include refunds of self-employment income taxes under the Self Employment Insurance Contributions Act (SECA); railroad retirement taxes under the Railroad Retirement Tax Act (RRTA); unemployment insurance taxes under the Federal Unemployment Tax Act (FUTA); and Old-Age, Survivors, Disability, and Hospital Insurance (OASDHI) taxes on salaries and wages under the Federal Insurance Contributions Act (FICA)\n",
+      "* excise tax refunds exclude refunds of excise taxes paid from U.S. Customs and Border Protection and the Alcohol Tobacco Tax and Trade Bureau"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "refunds = pd.DataFrame()\n",
+      "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
+      "for year in years:\n",
+      "    file = 'refunds/Refunds%s.csv' % year\n",
+      "    df = pd.read_csv(\n",
+      "        file, \n",
+      "        header=0,\n",
+      "        thousands=',',\n",
+      "        names=['state', 'total','business','individual_income', \n",
+      "                'estate_trust', 'employment', 'estate', 'gift', 'excise'],\n",
+      "    )\n",
+      "    df.dropna(subset=['state'],inplace=True)\n",
+      "    df['year'] = year\n",
+      "    refunds = pd.concat([refunds, df])\n",
+      "    \n",
+      "#hacky stuff to fix up state names\n",
+      "refunds['state'] = refunds['state'].str.strip()\n",
+      "refunds['state'] = refunds['state'].str.replace('US Total', 'United States')\n",
+      "\n",
+      "#hacky stuff to get rid of non-numbers\n",
+      "for col in refunds.columns.values:\n",
+      "    if col <> 'state':\n",
+      "        refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True)  \n",
+      "\n",
+      "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
+      "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
+      "refunds['estate'] = refunds['estate'].astype(str).convert_objects(convert_numeric=True)\n",
+      "refunds['gift'] = refunds['gift'].astype(str).convert_objects(convert_numeric=True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 296
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "gross.to_csv('grossall.csv',index=False)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 297
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Net Taxes by State"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#create stripped-down, standardized versions of gross collections and refunds, so we can calculate net\n",
+      "g = gross[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift', 'excise']]\n",
+      "g.set_index(['year', 'state'], inplace=True)\n",
+      "r = refunds\n",
+      "r['estate_trust'] = r['estate_trust'].fillna(0)\n",
+      "r['individual_total'] = r['individual_income'] + r['estate_trust'] + r['employment']\n",
+      "r = r[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift','excise']]\n",
+      "r.set_index(['year', 'state'], inplace=True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 298
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "net = g.sub(r)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 299
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "net.to_csv('net_taxes.csv')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 300
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}