@@ -5159,85 +5159,112 @@ easy conversion to and from pandas.
51595159Performance Considerations
51605160--------------------------
51615161
5162- This is an informal comparison of various IO methods, using pandas 0.13.1.
5162+ This is an informal comparison of various IO methods, using pandas
5163+ 0.20.3. Timings are machine dependent and small differences should be
5164+ ignored.
51635165
51645166.. code-block:: ipython
51655167
5166- In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB'))
5168+ In [1]: sz = 1000000
5169+ In [2]: df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
51675170
5168- In [2]: df.info()
5171+ In [3]: df.info()
51695172 <class 'pandas.core.frame.DataFrame'>
5170- Int64Index: 1000000 entries, 0 to 999999
5173+ RangeIndex: 1000000 entries, 0 to 999999
51715174 Data columns (total 2 columns):
51725175 A 1000000 non-null float64
5173- B 1000000 non-null float64
5174- dtypes: float64(2)
5175- memory usage: 22.9 MB
5176+ B 1000000 non-null int64
5177+ dtypes: float64(1), int64(1)
5178+ memory usage: 15.3 MB
51765179
51775180 Writing
51785181
51795182.. code-block:: ipython
51805183
51815184 In [14]: %timeit test_sql_write(df)
5182- 1 loops, best of 3: 6.24 s per loop
5185+ 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
51835186
51845187 In [15]: %timeit test_hdf_fixed_write(df)
5185- 1 loops, best of 3: 237 ms per loop
5188+ 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
51865189
51875190 In [26]: %timeit test_hdf_fixed_write_compress(df)
5188- 1 loops, best of 3: 245 ms per loop
5191+ 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
51895192
51905193 In [16]: %timeit test_hdf_table_write(df)
5191- 1 loops, best of 3: 901 ms per loop
5194+ 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
51925195
51935196 In [27]: %timeit test_hdf_table_write_compress(df)
5194- 1 loops, best of 3: 952 ms per loop
5197+ 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
51955198
51965199 In [17]: %timeit test_csv_write(df)
5197- 1 loops, best of 3: 3.44 s per loop
5200+ 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5201+
5202+ In [30]: %timeit test_feather_write(df)
5203+ 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5204+
5205+ In [31]: %timeit test_pickle_write(df)
5206+ 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5207+
5208+ In [32]: %timeit test_pickle_write_compress(df)
5209+ 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
51985210
51995211 Reading
52005212
52015213.. code-block:: ipython
52025214
52035215 In [18]: %timeit test_sql_read()
5204- 1 loops, best of 3: 766 ms per loop
5216+ 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
52055217
52065218 In [19]: %timeit test_hdf_fixed_read()
5207- 10 loops, best of 3: 19.1 ms per loop
5219+ 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
52085220
52095221 In [28]: %timeit test_hdf_fixed_read_compress()
5210- 10 loops, best of 3: 36.3 ms per loop
5222+ 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
52115223
52125224 In [20]: %timeit test_hdf_table_read()
5213- 10 loops, best of 3: 39 ms per loop
5225+ 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
52145226
52155227 In [29]: %timeit test_hdf_table_read_compress()
5216- 10 loops, best of 3: 60.6 ms per loop
5228+ 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
52175229
52185230 In [22]: %timeit test_csv_read()
5219- 1 loops, best of 3: 620 ms per loop
5231+ 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5232+
5233+ In [33]: %timeit test_feather_read()
5234+ 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5235+
5236+ In [34]: %timeit test_pickle_read()
5237+ 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5238+
5239+ In [35]: %timeit test_pickle_read_compress()
5240+ 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
52205241
52215242 Space on disk (in bytes)
52225243
52235244.. code-block:: none
52245245
5225- 25843712 Apr 8 14:11 test.sql
5226- 24007368 Apr 8 14:11 test_fixed.hdf
5227- 15580682 Apr 8 14:11 test_fixed_compress.hdf
5228- 24458444 Apr 8 14:11 test_table.hdf
5229- 16797283 Apr 8 14:11 test_table_compress.hdf
5230- 46152810 Apr 8 14:11 test.csv
5246+ 34816000 Aug 21 18:00 test.sql
5247+ 24009240 Aug 21 18:00 test_fixed.hdf
5248+ 7919610 Aug 21 18:00 test_fixed_compress.hdf
5249+ 24458892 Aug 21 18:00 test_table.hdf
5250+ 8657116 Aug 21 18:00 test_table_compress.hdf
5251+ 28520770 Aug 21 18:00 test.csv
5252+ 16000248 Aug 21 18:00 test.feather
5253+ 16000848 Aug 21 18:00 test.pkl
5254+ 7554108 Aug 21 18:00 test.pkl.compress
52315255
52325256 And here's the code
52335257
52345258.. code-block:: python
52355259
5236- import sqlite3
52375260 import os
5261+ import pandas as pd
5262+ import sqlite3
5263+ from numpy.random import randn
52385264 from pandas.io import sql
52395265
5240- df = pd.DataFrame(randn(1000000, 2), columns=list('AB'))
5266+ sz = 1000000
5267+ df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
52415268
52425269 def test_sql_write(df):
52435270     if os.path.exists('test.sql'):
@@ -5280,3 +5307,21 @@ And here's the code
52805307
52815308 def test_csv_read():
52825309     pd.read_csv('test.csv', index_col=0)
5310+
5311+ def test_feather_write(df):
5312+     df.to_feather('test.feather')
5313+
5314+ def test_feather_read():
5315+     pd.read_feather('test.feather')
5316+
5317+ def test_pickle_write(df):
5318+     df.to_pickle('test.pkl')
5319+
5320+ def test_pickle_read():
5321+     pd.read_pickle('test.pkl')
5322+
5323+ def test_pickle_write_compress(df):
5324+     df.to_pickle('test.pkl.compress', compression='xz')
5325+
5326+ def test_pickle_read_compress():
5327+     pd.read_pickle('test.pkl.compress', compression='xz')
0 commit comments