Comparing length of json-encoded DataFrame. [to_json doc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) Conclusions with my sample dataset * `CSV` * can be significantly more compact if your data has lots of `nan`s / *strings*, since their encoding is empty / not-quoted * but index is lost * and careful with incompatible quoting if you embed csv in json * `split` is the most compact `json` encoding that keeps column and index names, use it over `columns` (default) / `records` * but index *name* is lost * and [MultiIndex does not round-trip](https://github.com/pandas-dev/pandas/issues/4889) * `table` is the json mode with best roundtrip support, but is 3x bigger * an additional 2x (csv) / 3x (split) can be gained with bz2 + base64 at the cost of CPU/complexity * and an additional -7% size with base85 at a 13x cpu cost ```python for orient in 'records split index columns values table'.split(): j = o.to_json(orient=orient) print(orient, len(j), j[:500], j[-100:], sep='\n') import pickle print('pickle', len(pickle.dumps(o))) import zlib import bz2 import pickle import base64 import json import pyarrow import pyarrow.parquet df = o for orient in 'records split index columns values table'.split(): print(orient, len(df.to_json(orient=orient))) # csv 64 dfcsv = df.to_csv().encode() print('~csv', len(dfcsv)) print('csv.b64', len(json.dumps(base64.b64encode(dfcsv).decode()))) dfcsv_gz_64 = base64.b64encode(zlib.compress(dfcsv)).decode() print('csv.gz.b64', len(json.dumps(dfcsv_gz_64))) dfcsv_bz_64 = base64.b64encode(bz2.compress(dfcsv)).decode() print('csv.bz.b64', len(json.dumps(dfcsv_bz_64))) # pickle base 64 (&85 which is 13x slower) pik = pickle.dumps(df) print('~pickle', len(pik)) print('pickle.b64', len(json.dumps(base64.b64encode(pik).decode()))) pickle_gz_64 = base64.b64encode(bz2.compress(pik)).decode() pickle_gz_85 = base64.b85encode(bz2.compress(pik)).decode() print('pickle.bz.b64 b85', len(json.dumps(pickle_gz_64)), 
len(json.dumps(pickle_gz_85))) # parquet 64 table = pyarrow.Table.from_pandas(df) buf = pyarrow.BufferOutputStream() pyarrow.parquet.write_table(table, buf) park = buf.getvalue().to_pybytes() print('~parquet', len(park)) print('parquet.b64', len(json.dumps(base64.b64encode(park).decode()))) park_gz_64 = base64.b64encode(zlib.compress(park)).decode() print('parquet.gz.b64', len(json.dumps(park_gz_64))) park_bz_64 = base64.b64encode(bz2.compress(park)).decode() print('parquet.bz.b64', len(json.dumps(park_bz_64))) # split base64 split = df.to_json(orient='split').encode() print('split', len(split)) print('~split.gz', len(zlib.compress(split))) splitgz64 = base64.b64encode(zlib.compress(split)).decode() print('split.gz.b64', len(json.dumps(splitgz64))) splitbz64 = base64.b64encode(bz2.compress(split)).decode() print('split.bz.b64', len(json.dumps(splitbz64))) ``` Results: ``` records 68728 split 37894 index 79318 columns 78468 values 27227 table 87551 csv 21561 csv.gz.b64 12602 csv.bz.b64 11166 ~pickle 46633 pickle.b64 62182 pickle.bz.b64 b85 16138 15129 ~parquet 22832 parquet.b64 30446 parquet.gz.b64 21214 parquet.bz.b64 22994 split 37894 ~split.gz 10353 split.gz.b64 13806 split.bz.b64 11550 ``` ✕ records -- 68728 -- no index! ``` [ {"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447}, {"column1":null,"column2":null,"column3":null,"column4":9.3134,"column5":null}, {"column1":null,"column2":null,"column3":16.7342,"column4":23.3783,"column5":null}, ... {"column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null} ] ``` ✓ split -- 37894 -- smallest complete since no repetition of columns etc. ``` { "columns":["column1","column2","column3","column4","column5"], "index":[null,"index1","index2","index3",... "data": [ [587.295,null,192.5252,null,null], [null,null,61.2949,129.5094,null], ... 
[152.7775,null,null,null,null] ] } ``` ✓ index -- 79318 ``` { "nan": {"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447}, "index1":{"column1":null,"column2":null,"column3":null,"column4":9.3134,"column5":null}, "index2":{"column1":null,"column2":null,"column3":16.7342,"column4":23.3783,"column5":null}, ... "index999":{"column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null} } ``` ✓ columns -- 78468 -- verbose if index is verbose ``` {"column1": {"nan":null,"index1":null,"ALAKC08AA":5.466,"ALAKC08EE":0.9689,...} ... {"column5": ... {..., "index999":null} } ``` ✕ values -- 27227 -- no index, no column names ! ``` [ [null,null,null,null,145502.6447], [null,null,null,6.5745,null], ... [152.7775,null,null,null,null] ] ``` ✓ table -- 87551 -- ``` {"schema": { "fields": [{"name":"office","type":"string"},{"name":"column1","type":"number"},{"name":"column2","type":"number"},{"name":"column3","type":"number"},{"name":"column4","type":"number"},{"name":"column5","type":"number"}], "primaryKey":["office"], "pandas_version":"0.20.0" }, "data":[ {"office":null,"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447} {"office":"index2" , ...} ... {"office":"index999","column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null} ] } ```