Last active
          November 21, 2017 22:22 
        
      - 
      
 - 
        
Save bsweger/935128d4f5b54d413a83 to your computer and use it in GitHub Desktop.  
Revisions
- 
        
bsweger revised this gist
Aug 1, 2014 . 1 changed file with 13 additions and 14 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,7 @@ { "metadata": { "name": "", "signature": "sha256:8150453367b906e07eef200351901fb9c424c6eeeb84438199e61c4a49c6d449" }, "nbformat": 3, "nbformat_minor": 0, @@ -33,21 +33,21 @@ "collapsed": false, "input": [ "import numpy as np\n", "import pandas as pd\n", "import glob" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "gross = pd.DataFrame()\n", "grossfiles = glob.glob('gross/gross????.csv')\n", "for file in grossfiles:\n", " df = pd.read_csv(\n", " file, \n", " header=0,\n", @@ -81,7 +81,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 29 }, { "cell_type": "heading", @@ -109,9 +109,8 @@ "collapsed": false, "input": [ "refunds = pd.DataFrame()\n", "refundfiles = glob.glob('refunds/refunds????.csv')\n", "for file in refundfiles:\n", " df = pd.read_csv(\n", " file, \n", " header=0,\n", @@ -140,7 +139,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "heading", @@ -166,7 +165,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 31 }, { "cell_type": "code", @@ -177,7 +176,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 32 }, { "cell_type": "code", @@ -188,7 +187,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 } ], "metadata": {}  - 
        
bsweger revised this gist
Jul 31, 2014 . 1 changed file with 1 addition and 12 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,7 @@ { "metadata": { "name": "", "signature": "sha256:eb74777a052264aae3749a175943a15d17dd907b18bc57761b73c885d9a67cc1" }, "nbformat": 3, "nbformat_minor": 0, @@ -142,17 +142,6 @@ "outputs": [], "prompt_number": 38 }, { "cell_type": "heading", "level": 2,  - 
        
bsweger revised this gist
Jul 25, 2014 . 1 changed file with 10 additions and 10 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,7 @@ { "metadata": { "name": "", "signature": "sha256:08455faf1ffcde2b6c482f4b8542781aea6c495ea5e0160f21db3b139958ddbd" }, "nbformat": 3, "nbformat_minor": 0, @@ -38,7 +38,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", @@ -70,7 +70,7 @@ "#hacky stuff to get rid of non-numbers\n", "for col in gross.columns.values:\n", " if col <> 'state':\n", " gross[col] = gross[col].replace('[^0-9.]+', '', regex=True) \n", "\n", "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n", "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n", @@ -81,7 +81,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 37 }, { "cell_type": "heading", @@ -130,7 +130,7 @@ "#hacky stuff to get rid of non-numbers\n", "for col in refunds.columns.values:\n", " if col <> 'state':\n", " refunds[col] = refunds[col].replace('[^0-9.]+', '', regex=True) \n", "\n", "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n", "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n", @@ -140,7 +140,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 38 }, { "cell_type": "code", @@ -151,7 +151,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 39 }, { "cell_type": "heading", @@ -177,7 +177,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 40 }, { "cell_type": "code", @@ -188,7 +188,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 42 }, { "cell_type": "code", @@ -199,7 +199,7 @@ "language": "python", "metadata": {}, "outputs": [], "prompt_number": 43 } ], "metadata": {}  - 
        
bsweger created this gist
Jul 24, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,208 @@ { "metadata": { "name": "", "signature": "sha256:0bcd9f4735d0a57d6a01f8e156f3cb70f2f0ee17e26e59360a69ca1ddbe8c068" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Gross Tax Collections by State" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Gross-Collections,-by-Type-of-Tax-and-State,-Fiscal-Year-IRS-Data-Book-Table-5\n", "\n", "**Notes:**\n", "\n", "* Estate and trust income tax is tax on the income (or loss) of an estate or a trust. Prior to FY 2008, this amount was reported as part of the individual income tax total\n", "* Estate tax is the tax on a property transfer at death" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 294 }, { "cell_type": "code", "collapsed": false, "input": [ "gross = pd.DataFrame()\n", "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n", "for year in years:\n", " file = 'gross/%sIRSGross.csv' % year\n", " df = pd.read_csv(\n", " file, \n", " header=0,\n", " thousands=',',\n", " names=['state', 'total','business','individual_total', \n", " 'individual_witheld_fica', 'individual_payments_seca', \n", " 'ui', 'rr', 'estate_trust', 'estate', 'gift', 'excise']\n", " )\n", " df.dropna(subset=['state'],inplace=True)\n", " df['year'] = year\n", " gross = pd.concat([gross, df])\n", "\n", "#hacky stuff to fix up state names\n", "gross['state'] = gross['state'].str.strip()\n", "gross['state'] = gross['state'].str.replace('United States, total \\[4\\]', 'United States')\n", "gross['state'] = gross['state'].str.replace('US Totals', 'United States')\n", "gross['state'] = gross['state'].str.replace('US Total', 'United States')\n", "#gross.replace('*','')\n", "\n", "#hacky stuff to get rid of non-numbers\n", "for col in gross.columns.values:\n", " if col <> 'state':\n", " gross[col] = gross[col].replace('[^0-9]+', '', regex=True) \n", "\n", "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n", "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n", "gross['ui'] = gross['ui'].astype(str).convert_objects(convert_numeric=True)\n", "gross['estate'] = gross['estate'].astype(str).convert_objects(convert_numeric=True)\n", "gross['gift'] = gross['gift'].astype(str).convert_objects(convert_numeric=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 295 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Tax Refunds by State" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "source = IRS http://www.irs.gov/uac/SOI-Tax-Stats-Amount-of-Refunds-Issued,-Including-Interest,-by-State-and-Fiscal-Year-IRS-Data-Book-Table-8\n", "\n", "**Notes**\n", "\n", "* total tax refunds include overpayment refunds, refunds resulting from examination activity, refundable tax credits, and other refunds required by law. Also includes interest.\n", "* employment refunds include refunds of self-employment income taxes under the Self Employment Insurance Contributions Act (SECA); railroad retirement taxes under the Railroad Retirement Tax Act (RRTA); unemployment insurance taxes under the Federal Unemployment Tax Act (FUTA); and Old-Age, Survivors, Disability, and Hospital Insurance (OASDHI) taxes on salaries and wages under the Federal Insurance Contributions Act (FICA)\n", "* excise tax refunds exclude refunds of excise taxes paid from U.S. Customs and Border Protection and the Alcohol Tobacco Tax and Trade Bureau" ] }, { "cell_type": "code", "collapsed": false, "input": [ "refunds = pd.DataFrame()\n", "years = range(2002,2014) #pre-2002, IRS lumped MD and DC together\n", "for year in years:\n", " file = 'refunds/Refunds%s.csv' % year\n", " df = pd.read_csv(\n", " file, \n", " header=0,\n", " thousands=',',\n", " names=['state', 'total','business','individual_income', \n", " 'estate_trust', 'employment', 'estate', 'gift', 'excise'],\n", " )\n", " df.dropna(subset=['state'],inplace=True)\n", " df['year'] = year\n", " refunds = pd.concat([refunds, df])\n", " \n", "#hacky stuff to fix up state names\n", "refunds['state'] = refunds['state'].str.strip()\n", "refunds['state'] = refunds['state'].str.replace('US Total', 'United States')\n", "\n", "#hacky stuff to get rid of non-numbers\n", "for col in refunds.columns.values:\n", " if col <> 'state':\n", " refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True) \n", "\n", "#hacky stuff to force datatypes to numeric even when there's incoming non-numeric data\n", "#(see http://stackoverflow.com/questions/17457418/converting-pandas-dataframe-types)\n", "refunds['estate'] = refunds['estate'].astype(str).convert_objects(convert_numeric=True)\n", "refunds['gift'] = refunds['gift'].astype(str).convert_objects(convert_numeric=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 296 }, { "cell_type": "code", "collapsed": false, "input": [ "gross.to_csv('grossall.csv',index=False)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 297 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Net Taxes by State" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#create stripped-down, standardized versions of gross collections and refunds, so we can calculate net\n", "g = gross[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift', 'excise']]\n", "g.set_index(['year', 'state'], inplace=True)\n", "r = refunds\n", "r['estate_trust'] = r['estate_trust'].fillna(0)\n", "r['individual_total'] = r['individual_income'] + r['estate_trust'] + r['employment']\n", "r = r[['year', 'state', 'total', 'business', 'individual_total', 'estate', 'gift','excise']]\n", "r.set_index(['year', 'state'], inplace=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 298 }, { "cell_type": "code", "collapsed": false, "input": [ "net = g.sub(r)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 299 }, { "cell_type": "code", "collapsed": false, "input": [ "net.to_csv('net_taxes.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 300 } ], "metadata": {} } ] }