# Load the awards dump into the module-level DataFrame ``awards``, add a
# ``DurationYears`` column, and drop rows whose duration is non-positive or
# implausibly long.
import pandas
import numpy as np


def _parse_dollar_amount(text):
    """Convert a currency string such as ``"$1,234.50"`` to a float.

    Dollar signs and thousands separators are stripped. A cell that is
    blank after stripping yields NaN instead of raising ``ValueError``.
    """
    cleaned = text.replace('$', '').replace(',', '').strip()
    return float(cleaned) if cleaned else np.nan


# Only the three real date columns go in ``parse_dates``. The amount column
# used to be listed there as well, which is the likely reason it did not end
# up as float. No ``dtype`` is given for it either: pandas applies a converter
# INSTEAD of dtype conversion for the same column.
awards = pandas.read_csv(
    "awards_dump_2014-06-20.csv",
    parse_dates=['StartDate', 'LastAmendmentDate', 'ExpirationDate'],
    converters={'AwardedAmountToDate': _parse_dollar_amount},
)

# Cheap safeguard: guarantee a float column whatever dtype the parser
# inferred from the converter's output.
awards['AwardedAmountToDate'] = awards['AwardedAmountToDate'].astype(float)

# Convert the timedelta to fractional days first, then to years.
# ``365.25 * np.timedelta64(1, 'D')`` must be avoided: timedelta64 is
# integer-backed, so that product truncates to exactly 365 days.
duration_days = ((awards['ExpirationDate'] - awards['StartDate'])
                 / np.timedelta64(1, 'D'))
awards['DurationYears'] = duration_days / 365.25

# A few awards have ExpirationDate < StartDate or implausibly long durations;
# eliminate those (the original author counted 17 out of ~6500). Rows with a
# missing date have a NaN duration, fail both comparisons, and are dropped too.
awards = awards[(awards['DurationYears'] > 0)
                & (awards['DurationYears'] <= 11.)]