@@ -3,7 +3,8 @@
'numbers' : range (1000000 )})
import pickle
import cPickle
# Python 3 has no cPickle
#import cPickle
import json
from functools import partial
from time import time
@@ -36,17 +37,26 @@ def jsonloads(text):
return pd .Series (values , index = index )
# Storage formats to benchmark, in the order they are reported.
# The cPickle / cPickle-p2 variants are intentionally absent: Python 3 has no
# cPickle module, so only the `pickle` entries remain.
keys = [
    'json-no-index', 'json-no-index-native', 'json', 'json-native',
    'pickle', 'pickle-p2', 'pickle-p4', 'msgpack', 'csv', 'hdfstore',
]

# Maps each storage name to a two-element list: [loads, dumps].
# `dumps` takes the object to serialize; `loads` takes what `dumps` produced.
d = {
    'pickle': [pickle.loads, pickle.dumps],
    'pickle-p2': [pickle.loads, partial(pickle.dumps, protocol=2)],
    'pickle-p4': [pickle.loads, partial(pickle.dumps, protocol=4)],
    'msgpack': [pd.read_msgpack, pd.Series.to_msgpack],
    'csv': [csvloads, csvdumps],
    'hdfstore': [hdfloads, hdfdumps],
    # Elements are cast with int() before encoding.
    # NOTE(review): presumably because they are NumPy integers, which the
    # stdlib json encoder rejects -- confirm against the benchmark data.
    'json-no-index': [
        json.loads,
        lambda x: json.dumps([int(y) for y in x]),
    ],
    'json-no-index-native': [
        lambda x: pd.Series(pd.json.decode(x)),
        lambda x: x.to_json(orient='values'),
    ],
    # Serializes the index and the values as two parallel lists.
    'json': [
        jsonloads,
        lambda x: json.dumps([[int(y) for y in x.index], [int(y) for y in x]]),
    ],
    'json-native': [
        lambda x: pd.Series(pd.json.decode(x)),
        lambda x: x.to_json(),
    ],
}

# Timing results; the plotting code reads it as
# result[storage]['text' | 'numbers']['loads' | 'dumps'].
result = {}
@@ -73,56 +83,43 @@ def jsonloads(text):
# Figure 1: load/dump durations for every storage format, one panel per
# benchmarked column ('text' on the left, 'numbers' on the right).
w, h = 7, 7
f, (left, right) = plt.subplots(
    nrows=1, ncols=2, sharex=True, figsize=(w * 2, h), squeeze=True
)

panels = [
    (left, 'text', 'Cost to Serialize Text'),
    (right, 'numbers', 'Cost to Serialize Numerical Data'),
]
for axis, column, title in panels:
    # The frame is named `new_df` (not `df`) so the benchmark dataset held in
    # `df` is not overwritten; later code still reads `df.text`.
    new_df = pd.DataFrame({
        'loads': [result[key][column]['loads'] for key in keys],
        'dumps': [result[key][column]['dumps'] for key in keys],
        'storage': keys,
    })
    # Long format: one row per (storage, operation) pair.
    new_df = pd.melt(new_df, "storage", value_name="duration", var_name="operation")
    # Keyword arguments: newer seaborn releases no longer accept x/y/hue
    # positionally, and keywords work on older releases as well.
    sns.barplot(x="duration", y="storage", hue="operation", data=new_df, ax=axis)
    axis.set(xlabel="Duration (s)", ylabel="")
    sns.despine(bottom=True)
    axis.set_title(title)
    axis.legend(loc="lower center", ncol=2, frameon=True, title="operation")

plt.savefig('../images/serialize.png')
# Figure 2: the same numeric-data comparison restricted to a few formats.
f, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=(w, h), squeeze=True)

# 'cPickle-p2' is no longer benchmarked (Python 3 has no cPickle), so
# 'pickle-p4' takes its place here, as it does in `keys`.
# NOTE(review): assumes `result` has an entry for every name in `keys` --
# confirm against the timing loop.
keys2 = ['pickle-p2', 'pickle-p4', 'msgpack', 'hdfstore']

# Named `new_df` so the benchmark dataset held in `df` is not overwritten.
new_df = pd.DataFrame({
    'loads': [result[key]['numbers']['loads'] for key in keys2],
    'dumps': [result[key]['numbers']['dumps'] for key in keys2],
    'storage': keys2,
})
new_df = pd.melt(new_df, "storage", value_name="duration", var_name="operation")

# Keyword arguments: newer seaborn releases no longer accept x/y/hue
# positionally, and keywords work on older releases as well.
sns.barplot(x="duration", y="storage", hue="operation", data=new_df, ax=ax)
ax.set(xlabel="Duration (s)", ylabel="")
sns.despine(bottom=True)
ax.set_title('Cost to Serialize Numerical Data')
ax.legend(loc="lower center", ncol=2, frameon=True, title="operation")

plt.savefig('../images/serialize-subset.png')
# NOTE(review): this rebuilds the text-timing frame but nothing below in the
# visible code reads it; kept in case later code relies on `new_df`.
new_df = pd.DataFrame({
    'loads': [result[key]['text']['loads'] for key in keys],
    'dumps': [result[key]['text']['dumps'] for key in keys],
    'storage': keys,
})

# Compare pickling the 'text' column as-is against pickling it as a
# categorical, and record how long the conversion itself takes.
# NOTE(review): `df` is assumed to be the benchmark dataset with a 'text'
# column, defined outside this section -- confirm.
df2 = df.copy()
start = time()
# Plain column assignment replaces the column outright.  The
# `.loc[:, 'text'] = ...` form writes in place on pandas 2.x and can leave the
# column as object dtype, which would make the 'categories' timing meaningless.
df2['text'] = df2['text'].astype('category')
end = time()

# `timeit` is a helper defined elsewhere in this file.
# NOTE(review): presumably it returns a duration in seconds -- confirm.
categories = {
    'convert': end - start,
    'text': timeit(lambda: pickle.loads(pickle.dumps(df.text, protocol=2))),
    'categories': timeit(lambda: pickle.loads(pickle.dumps(df2.text, protocol=2))),
}

# Emit the three timings as an HTML table, in a fixed row order.
print(
    pd.DataFrame(
        pd.Series(categories, name='seconds', index=['text', 'convert', 'categories'])
    ).to_html()
)
plt.show()