|
14 | 14 | import datetime |
15 | 15 | from io import StringIO |
16 | 16 | import itertools |
17 | | -import sys |
18 | 17 | from textwrap import dedent |
19 | 18 | from typing import ( |
20 | 19 | IO, |
|
131 | 130 |
|
132 | 131 | from pandas.io.common import get_filepath_or_buffer |
133 | 132 | from pandas.io.formats import console, format as fmt |
134 | | -from pandas.io.formats.printing import pprint_thing |
| 133 | +from pandas.io.formats.info import info |
135 | 134 | import pandas.plotting |
136 | 135 |
|
137 | 136 | if TYPE_CHECKING: |
@@ -2225,282 +2224,11 @@ def to_html( |
2225 | 2224 | ) |
2226 | 2225 |
|
2227 | 2226 | # ---------------------------------------------------------------------- |
2228 | | - |
| 2227 | + @Appender(info.__doc__) |
2229 | 2228 | def info( |
2230 | 2229 | self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None |
2231 | 2230 | ) -> None: |
2232 | | - """ |
2233 | | - Print a concise summary of a DataFrame. |
2234 | | -
|
2235 | | - This method prints information about a DataFrame including |
2236 | | - the index dtype and column dtypes, non-null values and memory usage. |
2237 | | -
|
2238 | | - Parameters |
2239 | | - ---------- |
2240 | | - verbose : bool, optional |
2241 | | - Whether to print the full summary. By default, the setting in |
2242 | | - ``pandas.options.display.max_info_columns`` is followed. |
2243 | | - buf : writable buffer, defaults to sys.stdout |
2244 | | - Where to send the output. By default, the output is printed to |
2245 | | - sys.stdout. Pass a writable buffer if you need to further process |
2246 | | - the output. |
2247 | | - max_cols : int, optional |
2248 | | - When to switch from the verbose to the truncated output. If the |
2249 | | - DataFrame has more than `max_cols` columns, the truncated output |
2250 | | - is used. By default, the setting in |
2251 | | - ``pandas.options.display.max_info_columns`` is used. |
2252 | | - memory_usage : bool, str, optional |
2253 | | - Specifies whether total memory usage of the DataFrame |
2254 | | - elements (including the index) should be displayed. By default, |
2255 | | - this follows the ``pandas.options.display.memory_usage`` setting. |
2256 | | -
|
2257 | | - True always shows memory usage. False never shows memory usage. |
2258 | | - A value of 'deep' is equivalent to "True with deep introspection". |
2259 | | - Memory usage is shown in human-readable units (base-2 |
2260 | | - representation). Without deep introspection a memory estimation is |
2261 | | - made based on column dtype and number of rows assuming values |
2262 | | - consume the same memory amount for corresponding dtypes. With deep |
2263 | | - memory introspection, a real memory usage calculation is performed |
2264 | | - at the cost of computational resources. |
2265 | | - null_counts : bool, optional |
2266 | | - Whether to show the non-null counts. By default, this is shown |
2267 | | - only if the frame is smaller than |
2268 | | - ``pandas.options.display.max_info_rows`` and |
2269 | | - ``pandas.options.display.max_info_columns``. A value of True always |
2270 | | - shows the counts, and False never shows the counts. |
2271 | | -
|
2272 | | - Returns |
2273 | | - ------- |
2274 | | - None |
2275 | | - This method prints a summary of a DataFrame and returns None. |
2276 | | -
|
2277 | | - See Also |
2278 | | - -------- |
2279 | | - DataFrame.describe: Generate descriptive statistics of DataFrame |
2280 | | - columns. |
2281 | | - DataFrame.memory_usage: Memory usage of DataFrame columns. |
2282 | | -
|
2283 | | - Examples |
2284 | | - -------- |
2285 | | - >>> int_values = [1, 2, 3, 4, 5] |
2286 | | - >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] |
2287 | | - >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] |
2288 | | - >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, |
2289 | | - ... "float_col": float_values}) |
2290 | | - >>> df |
2291 | | - int_col text_col float_col |
2292 | | - 0 1 alpha 0.00 |
2293 | | - 1 2 beta 0.25 |
2294 | | - 2 3 gamma 0.50 |
2295 | | - 3 4 delta 0.75 |
2296 | | - 4 5 epsilon 1.00 |
2297 | | -
|
2298 | | - Prints information of all columns: |
2299 | | -
|
2300 | | - >>> df.info(verbose=True) |
2301 | | - <class 'pandas.core.frame.DataFrame'> |
2302 | | - RangeIndex: 5 entries, 0 to 4 |
2303 | | - Data columns (total 3 columns): |
2304 | | - # Column Non-Null Count Dtype |
2305 | | - --- ------ -------------- ----- |
2306 | | - 0 int_col 5 non-null int64 |
2307 | | - 1 text_col 5 non-null object |
2308 | | - 2 float_col 5 non-null float64 |
2309 | | - dtypes: float64(1), int64(1), object(1) |
2310 | | - memory usage: 248.0+ bytes |
2311 | | -
|
2312 | | - Prints a summary of columns count and its dtypes but not per column |
2313 | | - information: |
2314 | | -
|
2315 | | - >>> df.info(verbose=False) |
2316 | | - <class 'pandas.core.frame.DataFrame'> |
2317 | | - RangeIndex: 5 entries, 0 to 4 |
2318 | | - Columns: 3 entries, int_col to float_col |
2319 | | - dtypes: float64(1), int64(1), object(1) |
2320 | | - memory usage: 248.0+ bytes |
2321 | | -
|
2322 | | - Pipe output of DataFrame.info to a buffer instead of sys.stdout, get |
2323 | | - the buffer content and write it to a text file: |
2324 | | -
|
2325 | | - >>> import io |
2326 | | - >>> buffer = io.StringIO() |
2327 | | - >>> df.info(buf=buffer) |
2328 | | - >>> s = buffer.getvalue() |
2329 | | - >>> with open("df_info.txt", "w", |
2330 | | - ... encoding="utf-8") as f: # doctest: +SKIP |
2331 | | - ... f.write(s) |
2332 | | - 260 |
2333 | | -
|
2334 | | - The `memory_usage` parameter allows deep introspection mode, especially |
2335 | | - useful for big DataFrames and for fine-tuning memory optimization: |
2336 | | -
|
2337 | | - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) |
2338 | | - >>> df = pd.DataFrame({ |
2339 | | - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), |
2340 | | - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), |
2341 | | - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) |
2342 | | - ... }) |
2343 | | - >>> df.info() |
2344 | | - <class 'pandas.core.frame.DataFrame'> |
2345 | | - RangeIndex: 1000000 entries, 0 to 999999 |
2346 | | - Data columns (total 3 columns): |
2347 | | - # Column Non-Null Count Dtype |
2348 | | - --- ------ -------------- ----- |
2349 | | - 0 column_1 1000000 non-null object |
2350 | | - 1 column_2 1000000 non-null object |
2351 | | - 2 column_3 1000000 non-null object |
2352 | | - dtypes: object(3) |
2353 | | - memory usage: 22.9+ MB |
2354 | | -
|
2355 | | - >>> df.info(memory_usage='deep') |
2356 | | - <class 'pandas.core.frame.DataFrame'> |
2357 | | - RangeIndex: 1000000 entries, 0 to 999999 |
2358 | | - Data columns (total 3 columns): |
2359 | | - # Column Non-Null Count Dtype |
2360 | | - --- ------ -------------- ----- |
2361 | | - 0 column_1 1000000 non-null object |
2362 | | - 1 column_2 1000000 non-null object |
2363 | | - 2 column_3 1000000 non-null object |
2364 | | - dtypes: object(3) |
2365 | | - memory usage: 188.8 MB |
2366 | | - """ |
2367 | | - if buf is None: # pragma: no cover |
2368 | | - buf = sys.stdout |
2369 | | - |
2370 | | - lines = [] |
2371 | | - |
2372 | | - lines.append(str(type(self))) |
2373 | | - lines.append(self.index._summary()) |
2374 | | - |
2375 | | - if len(self.columns) == 0: |
2376 | | - lines.append(f"Empty {type(self).__name__}") |
2377 | | - fmt.buffer_put_lines(buf, lines) |
2378 | | - return |
2379 | | - |
2380 | | - cols = self.columns |
2381 | | - col_count = len(self.columns) |
2382 | | - |
2383 | | - # hack |
2384 | | - if max_cols is None: |
2385 | | - max_cols = get_option("display.max_info_columns", len(self.columns) + 1) |
2386 | | - |
2387 | | - max_rows = get_option("display.max_info_rows", len(self) + 1) |
2388 | | - |
2389 | | - if null_counts is None: |
2390 | | - show_counts = (col_count <= max_cols) and (len(self) < max_rows) |
2391 | | - else: |
2392 | | - show_counts = null_counts |
2393 | | - exceeds_info_cols = col_count > max_cols |
2394 | | - |
2395 | | - def _verbose_repr(): |
2396 | | - lines.append(f"Data columns (total {len(self.columns)} columns):") |
2397 | | - |
2398 | | - id_head = " # " |
2399 | | - column_head = "Column" |
2400 | | - col_space = 2 |
2401 | | - |
2402 | | - max_col = max(len(pprint_thing(k)) for k in cols) |
2403 | | - len_column = len(pprint_thing(column_head)) |
2404 | | - space = max(max_col, len_column) + col_space |
2405 | | - |
2406 | | - max_id = len(pprint_thing(col_count)) |
2407 | | - len_id = len(pprint_thing(id_head)) |
2408 | | - space_num = max(max_id, len_id) + col_space |
2409 | | - counts = None |
2410 | | - |
2411 | | - header = _put_str(id_head, space_num) + _put_str(column_head, space) |
2412 | | - if show_counts: |
2413 | | - counts = self.count() |
2414 | | - if len(cols) != len(counts): # pragma: no cover |
2415 | | - raise AssertionError( |
2416 | | - f"Columns must equal counts ({len(cols)} != {len(counts)})" |
2417 | | - ) |
2418 | | - count_header = "Non-Null Count" |
2419 | | - len_count = len(count_header) |
2420 | | - non_null = " non-null" |
2421 | | - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) |
2422 | | - space_count = max(len_count, max_count) + col_space |
2423 | | - count_temp = "{count}" + non_null |
2424 | | - else: |
2425 | | - count_header = "" |
2426 | | - space_count = len(count_header) |
2427 | | - len_count = space_count |
2428 | | - count_temp = "{count}" |
2429 | | - |
2430 | | - dtype_header = "Dtype" |
2431 | | - len_dtype = len(dtype_header) |
2432 | | - max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) |
2433 | | - space_dtype = max(len_dtype, max_dtypes) |
2434 | | - header += _put_str(count_header, space_count) + _put_str( |
2435 | | - dtype_header, space_dtype |
2436 | | - ) |
2437 | | - |
2438 | | - lines.append(header) |
2439 | | - lines.append( |
2440 | | - _put_str("-" * len_id, space_num) |
2441 | | - + _put_str("-" * len_column, space) |
2442 | | - + _put_str("-" * len_count, space_count) |
2443 | | - + _put_str("-" * len_dtype, space_dtype) |
2444 | | - ) |
2445 | | - |
2446 | | - for i, col in enumerate(self.columns): |
2447 | | - dtype = self.dtypes.iloc[i] |
2448 | | - col = pprint_thing(col) |
2449 | | - |
2450 | | - line_no = _put_str(f" {i}", space_num) |
2451 | | - count = "" |
2452 | | - if show_counts: |
2453 | | - count = counts.iloc[i] |
2454 | | - |
2455 | | - lines.append( |
2456 | | - line_no |
2457 | | - + _put_str(col, space) |
2458 | | - + _put_str(count_temp.format(count=count), space_count) |
2459 | | - + _put_str(dtype, space_dtype) |
2460 | | - ) |
2461 | | - |
2462 | | - def _non_verbose_repr(): |
2463 | | - lines.append(self.columns._summary(name="Columns")) |
2464 | | - |
2465 | | - def _sizeof_fmt(num, size_qualifier): |
2466 | | - # returns size in human readable format |
2467 | | - for x in ["bytes", "KB", "MB", "GB", "TB"]: |
2468 | | - if num < 1024.0: |
2469 | | - return f"{num:3.1f}{size_qualifier} {x}" |
2470 | | - num /= 1024.0 |
2471 | | - return f"{num:3.1f}{size_qualifier} PB" |
2472 | | - |
2473 | | - if verbose: |
2474 | | - _verbose_repr() |
2475 | | - elif verbose is False: # specifically set to False, not nesc None |
2476 | | - _non_verbose_repr() |
2477 | | - else: |
2478 | | - if exceeds_info_cols: |
2479 | | - _non_verbose_repr() |
2480 | | - else: |
2481 | | - _verbose_repr() |
2482 | | - |
2483 | | - counts = self._data.get_dtype_counts() |
2484 | | - dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] |
2485 | | - lines.append(f"dtypes: {', '.join(dtypes)}") |
2486 | | - |
2487 | | - if memory_usage is None: |
2488 | | - memory_usage = get_option("display.memory_usage") |
2489 | | - if memory_usage: |
2490 | | - # append memory usage of df to display |
2491 | | - size_qualifier = "" |
2492 | | - if memory_usage == "deep": |
2493 | | - deep = True |
2494 | | - else: |
2495 | | - # size_qualifier is just a best effort; not guaranteed to catch |
2496 | | - # all cases (e.g., it misses categorical data even with object |
2497 | | - # categories) |
2498 | | - deep = False |
2499 | | - if "object" in counts or self.index._is_memory_usage_qualified(): |
2500 | | - size_qualifier = "+" |
2501 | | - mem_usage = self.memory_usage(index=True, deep=deep).sum() |
2502 | | - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") |
2503 | | - fmt.buffer_put_lines(buf, lines) |
| 2231 | + return info(self, verbose, buf, max_cols, memory_usage, null_counts) |
2504 | 2232 |
|
2505 | 2233 | def memory_usage(self, index=True, deep=False) -> Series: |
2506 | 2234 | """ |
@@ -8623,7 +8351,3 @@ def _from_nested_dict(data): |
8623 | 8351 | new_data[col] = new_data.get(col, {}) |
8624 | 8352 | new_data[col][index] = v |
8625 | 8353 | return new_data |
8626 | | - |
8627 | | - |
8628 | | -def _put_str(s, space): |
8629 | | - return str(s)[:space].ljust(space) |
0 commit comments