Skip to content

Instantly share code, notes, and snippets.

@bsweger
Last active November 21, 2017 22:22
Show Gist options
  • Save bsweger/935128d4f5b54d413a83 to your computer and use it in GitHub Desktop.
Save bsweger/935128d4f5b54d413a83 to your computer and use it in GitHub Desktop.

Revisions

  1. bsweger revised this gist Aug 1, 2014. 1 changed file with 13 additions and 14 deletions.
    27 changes: 13 additions & 14 deletions Taxes Paid and Personal Income.ipynb
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,7 @@
    {
    "metadata": {
    "name": "",
    "signature": "sha256:eb74777a052264aae3749a175943a15d17dd907b18bc57761b73c885d9a67cc1"
    "signature": "sha256:8150453367b906e07eef200351901fb9c424c6eeeb84438199e61c4a49c6d449"
    },
    "nbformat": 3,
    "nbformat_minor": 0,
    @@ -33,21 +33,21 @@
    "collapsed": false,
    "input": [
    "import numpy as np\n",
    "import pandas as pd"
    "import pandas as pd\n",
    "import glob"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 36
    "prompt_number": 28
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "gross = pd.DataFrame()\n",
    "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
    "for year in years:\n",
    " file = 'gross/%sIRSGross.csv' % year\n",
    "grossfiles = glob.glob('gross/gross????.csv')\n",
    "for file in grossfiles:\n",
    " df = pd.read_csv(\n",
    " file, \n",
    " header=0,\n",
    @@ -81,7 +81,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 37
    "prompt_number": 29
    },
    {
    "cell_type": "heading",
    @@ -109,9 +109,8 @@
    "collapsed": false,
    "input": [
    "refunds = pd.DataFrame()\n",
    "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
    "for year in years:\n",
    " file = 'refunds/Refunds%s.csv' % year\n",
    "refundfiles = glob.glob('refunds/refunds????.csv')\n",
    "for file in refundfiles:\n",
    " df = pd.read_csv(\n",
    " file, \n",
    " header=0,\n",
    @@ -140,7 +139,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 38
    "prompt_number": 30
    },
    {
    "cell_type": "heading",
    @@ -166,7 +165,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 40
    "prompt_number": 31
    },
    {
    "cell_type": "code",
    @@ -177,7 +176,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 42
    "prompt_number": 32
    },
    {
    "cell_type": "code",
    @@ -188,7 +187,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 43
    "prompt_number": 33
    }
    ],
    "metadata": {}
  2. bsweger revised this gist Jul 31, 2014. 1 changed file with 1 addition and 12 deletions.
    13 changes: 1 addition & 12 deletions Taxes Paid and Personal Income.ipynb
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,7 @@
    {
    "metadata": {
    "name": "",
    "signature": "sha256:08455faf1ffcde2b6c482f4b8542781aea6c495ea5e0160f21db3b139958ddbd"
    "signature": "sha256:eb74777a052264aae3749a175943a15d17dd907b18bc57761b73c885d9a67cc1"
    },
    "nbformat": 3,
    "nbformat_minor": 0,
    @@ -142,17 +142,6 @@
    "outputs": [],
    "prompt_number": 38
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "gross.to_csv('grossall.csv',index=False)"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 39
    },
    {
    "cell_type": "heading",
    "level": 2,
  3. bsweger revised this gist Jul 25, 2014. 1 changed file with 10 additions and 10 deletions.
    20 changes: 10 additions & 10 deletions Taxes Paid and Personal Income.ipynb
    Original file line number Diff line number Diff line change
    @@ -1,7 +1,7 @@
    {
    "metadata": {
    "name": "",
    "signature": "sha256:0bcd9f4735d0a57d6a01f8e156f3cb70f2f0ee17e26e59360a69ca1ddbe8c068"
    "signature": "sha256:08455faf1ffcde2b6c482f4b8542781aea6c495ea5e0160f21db3b139958ddbd"
    },
    "nbformat": 3,
    "nbformat_minor": 0,
    @@ -38,7 +38,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 294
    "prompt_number": 36
    },
    {
    "cell_type": "code",
    @@ -70,7 +70,7 @@
    "#hacky stuff to get rid of non-numbers\n",
    "for col in gross.columns.values:\n",
    " if col <> 'state':\n",
    " gross[col] = gross[col].replace('[^0-9]+', '', regex=True) \n",
    " gross[col] = gross[col].replace('[^0-9.]+', '', regex=True) \n",
    "\n",
    "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
    "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
    @@ -81,7 +81,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 295
    "prompt_number": 37
    },
    {
    "cell_type": "heading",
    @@ -130,7 +130,7 @@
    "#hacky stuff to get rid of non-numbers\n",
    "for col in refunds.columns.values:\n",
    " if col <> 'state':\n",
    " refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True) \n",
    " refunds[col] = refunds[col].replace('[^0-9.]+', '', regex=True) \n",
    "\n",
    "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
    "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
    @@ -140,7 +140,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 296
    "prompt_number": 38
    },
    {
    "cell_type": "code",
    @@ -151,7 +151,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 297
    "prompt_number": 39
    },
    {
    "cell_type": "heading",
    @@ -177,7 +177,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 298
    "prompt_number": 40
    },
    {
    "cell_type": "code",
    @@ -188,7 +188,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 299
    "prompt_number": 42
    },
    {
    "cell_type": "code",
    @@ -199,7 +199,7 @@
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 300
    "prompt_number": 43
    }
    ],
    "metadata": {}
  4. bsweger created this gist Jul 24, 2014.
    208 changes: 208 additions & 0 deletions Taxes Paid and Personal Income.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,208 @@
    {
    "metadata": {
    "name": "",
    "signature": "sha256:0bcd9f4735d0a57d6a01f8e156f3cb70f2f0ee17e26e59360a69ca1ddbe8c068"
    },
    "nbformat": 3,
    "nbformat_minor": 0,
    "worksheets": [
    {
    "cells": [
    {
    "cell_type": "heading",
    "level": 2,
    "metadata": {},
    "source": [
    "Gross Tax Collections by State"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Gross-Collections,-by-Type-of-Tax-and-State,-Fiscal-Year-IRS-Data-Book-Table-5\n",
    "\n",
    "**Notes:**\n",
    "\n",
    "* Estate and trust income tax is tax on the income (or loss) of an estate or a trust. Prior to FY 2008, this amount was reported as part of the individual income tax total\n",
    "* Estate tax is the tax on a property transfer at death"
    ]
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "import numpy as np\n",
    "import pandas as pd"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 294
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "gross = pd.DataFrame()\n",
    "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
    "for year in years:\n",
    " file = 'gross/%sIRSGross.csv' % year\n",
    " df = pd.read_csv(\n",
    " file, \n",
    " header=0,\n",
    " thousands=',',\n",
    " names=['state', 'total','business','individual_total', \n",
    " 'individual_witheld_fica', 'individual_payments_seca', \n",
    " 'ui', 'rr', 'estate_trust', 'estate', 'gift', 'excise']\n",
    " )\n",
    " df.dropna(subset=['state'],inplace=True)\n",
    " df['year'] = year\n",
    " gross = pd.concat([gross, df])\n",
    "\n",
    "#hacky stuff to fix up state names\n",
    "gross['state'] = gross['state'].str.strip()\n",
    "gross['state'] = gross['state'].str.replace('United States, total \\[4\\]', 'United States')\n",
    "gross['state'] = gross['state'].str.replace('US Totals', 'United States')\n",
    "gross['state'] = gross['state'].str.replace('US Total', 'United States')\n",
    "#gross.replace('*','')\n",
    "\n",
    "#hacky stuff to get rid of non-numbers\n",
    "for col in gross.columns.values:\n",
    " if col <> 'state':\n",
    " gross[col] = gross[col].replace('[^0-9]+', '', regex=True) \n",
    "\n",
    "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
    "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
    "gross['ui'] = gross['ui'].astype(str).convert_objects(convert_numeric=True)\n",
    "gross['estate'] = gross['estate'].astype(str).convert_objects(convert_numeric=True)\n",
    "gross['gift'] = gross['gift'].astype(str).convert_objects(convert_numeric=True)"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 295
    },
    {
    "cell_type": "heading",
    "level": 2,
    "metadata": {},
    "source": [
    "Tax Refunds by State"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Amount-of-Refunds-Issued,-Including-Interest,-by-State-and-Fiscal-Year-IRS-Data-Book-Table-8\n",
    "\n",
    "**Notes**\n",
    "\n",
    "* total tax refunds include overpayment refunds, refunds resulting from examination activity, refundable tax credits, and other refunds required by law. Also includes interest.\n",
    "* employment refunds include refunds of self-employment income taxes under the Self Employment Insurance Contributions Act (SECA); railroad retirement taxes under the Railroad Retirement Tax Act (RRTA); unemployment insurance taxes under the Federal Unemployment Tax Act (FUTA); and Old-Age, Survivors, Disability, and Hospital Insurance (OASDHI) taxes on salaries and wages under the Federal Insurance Contributions Act (FICA)\n",
    "* excise tax refunds exclude refunds of excise taxes paid from U.S. Customs and Border Protection and the Alcohol Tobacco Tax and Trade Bureau"
    ]
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "refunds = pd.DataFrame()\n",
    "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n",
    "for year in years:\n",
    " file = 'refunds/Refunds%s.csv' % year\n",
    " df = pd.read_csv(\n",
    " file, \n",
    " header=0,\n",
    " thousands=',',\n",
    " names=['state', 'total','business','individual_income', \n",
    " 'estate_trust', 'employment', 'estate', 'gift', 'excise'],\n",
    " )\n",
    " df.dropna(subset=['state'],inplace=True)\n",
    " df['year'] = year\n",
    " refunds = pd.concat([refunds, df])\n",
    " \n",
    "#hacky stuff to fix up state names\n",
    "refunds['state'] = refunds['state'].str.strip()\n",
    "refunds['state'] = refunds['state'].str.replace('US Total', 'United States')\n",
    "\n",
    "#hacky stuff to get rid of non-numbers\n",
    "for col in refunds.columns.values:\n",
    " if col <> 'state':\n",
    " refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True) \n",
    "\n",
    "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n",
    "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n",
    "refunds['estate'] = refunds['estate'].astype(str).convert_objects(convert_numeric=True)\n",
    "refunds['gift'] = refunds['gift'].astype(str).convert_objects(convert_numeric=True)"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 296
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "gross.to_csv('grossall.csv',index=False)"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 297
    },
    {
    "cell_type": "heading",
    "level": 2,
    "metadata": {},
    "source": [
    "Net Taxes by State"
    ]
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "#create stripped-down, standardized versions of gross collections and refunds, so we can calculate net\n",
    "g = gross[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift', 'excise']]\n",
    "g.set_index(['year', 'state'], inplace=True)\n",
    "r = refunds\n",
    "r['estate_trust'] = r['estate_trust'].fillna(0)\n",
    "r['individual_total'] = r['individual_income'] + r['estate_trust'] + r['employment']\n",
    "r = r[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift','excise']]\n",
    "r.set_index(['year', 'state'], inplace=True)"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 298
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "net = g.sub(r)"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 299
    },
    {
    "cell_type": "code",
    "collapsed": false,
    "input": [
    "net.to_csv('net_taxes.csv')"
    ],
    "language": "python",
    "metadata": {},
    "outputs": [],
    "prompt_number": 300
    }
    ],
    "metadata": {}
    }
    ]
    }