88import numbers
99import collections
1010import warnings
11+ import itertools
1112
12- from itertools import repeat
1313from distutils .version import LooseVersion
1414
1515import numpy as np
4848#############
4949# READ HTML #
5050#############
51- _RE_WHITESPACE = re .compile (r'( [\r\n]+|\s{2,}) ' )
51+ _RE_WHITESPACE = re .compile (r'[\r\n]+|\s{2,}' )
5252
5353
5454def _remove_whitespace (s , regex = _RE_WHITESPACE ):
@@ -100,8 +100,8 @@ def _get_skiprows(skiprows):
100100 elif skiprows is None :
101101 return 0
102102 else :
103- raise TypeError ('{0} is not a valid type for skipping'
104- ' rows' .format (type (skiprows )))
103+ raise TypeError ('{0!r } is not a valid type for skipping'
104+ ' rows' .format (type (skiprows ). __name__ ))
105105
106106
107107def _read (io ):
@@ -127,7 +127,7 @@ def _read(io):
127127 raw_text = io
128128 else :
129129 raise TypeError ("Cannot read object of type "
130- "'{0.__class__.__name__ !r}' " .format (io ))
130+ "{0 !r}" .format (type ( io ). __name__ ))
131131 return raw_text
132132
133133
@@ -587,30 +587,38 @@ def _parse_raw_tfoot(self, table):
587587
588588
589589def _nan_list (n ):
590- return list (repeat (np .nan , n ))
590+ return list (itertools . repeat (np .nan , n ))
591591
592592
def _expand_elements(body):
    """Pad the "ragged" rows of *body* in place with NaNs.

    Every row shorter than the longest row is extended with NaN entries so
    that all rows end up the same length before being handed to the parser.
    """
    row_lengths = Series(lmap(len, body))
    target_length = row_lengths.max()
    # Rows already at the maximum length need no padding.
    short_rows = row_lengths[row_lengths != target_length]

    for row_index, row_length in compat.iteritems(short_rows):
        body[row_index] += _nan_list(target_length - row_length)
600600
601601
602602def _data_to_frame (data , header , index_col , skiprows , infer_types ,
603- parse_dates ):
603+ parse_dates , tupleize_cols , thousands ):
604604 head , body , _ = data # _ is footer which is rarely used: ignore for now
605+
606+ if head :
607+ body = [head ] + body
608+
609+ if header is None : # special case when a table has <th> elements
610+ header = 0
611+
612+ # fill out elements of body that are "ragged"
605613 _expand_elements (body )
606- body = [head ] + body
607- import ipdb ; ipdb .set_trace ()
614+
608615 tp = TextParser (body , header = header , index_col = index_col ,
609616 skiprows = _get_skiprows (skiprows ),
610- parse_dates = parse_dates , tupleize_cols = False )
617+ parse_dates = parse_dates , tupleize_cols = tupleize_cols ,
618+ thousands = thousands )
611619 df = tp .read ()
612620
613- if infer_types : # remove in 0.14
621+ if infer_types : # TODO: remove in 0.14
614622 df = df .convert_objects (convert_dates = 'coerce' )
615623 else :
616624 df = df .applymap (compat .text_type )
@@ -687,7 +695,7 @@ def _validate_parser_flavor(flavor):
687695
688696
689697def _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
690- parse_dates , attrs ):
698+ parse_dates , tupleize_cols , thousands , attrs ):
691699 # bonus: re.compile is idempotent under function iteration so you can pass
692700 # a compiled regex to it and it will return itself
693701 flavor = _validate_parser_flavor (flavor )
@@ -709,65 +717,65 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
709717 raise retained
710718
711719 return [_data_to_frame (table , header , index_col , skiprows , infer_types ,
712- parse_dates ) for table in tables ]
720+ parse_dates , tupleize_cols , thousands )
721+ for table in tables ]
713722
714723
715- def read_html (io , match = '.+' , flavor = None , header = 0 , index_col = None ,
716- skiprows = None , infer_types = None , attrs = None , parse_dates = False ):
717- r"""Read an HTML table into a DataFrame.
724+ def read_html (io , match = '.+' , flavor = None , header = None , index_col = None ,
725+ skiprows = None , infer_types = None , attrs = None , parse_dates = False ,
726+ tupleize_cols = False , thousands = ',' ):
727+ r"""Read HTML tables into a ``list`` of DataFrames.
718728
719729 Parameters
720730 ----------
721731 io : str or file-like
722- A string or file like object that can be either a url, a file-like
723- object, or a raw string containing HTML. Note that lxml only accepts
724- the http, ftp and file url protocols. If you have a URI that starts
725- with ``'https'`` you might removing the ``'s'``.
732+ A URL, a file-like object, or a raw string containing HTML. Note that
733+ lxml only accepts the http, ftp and file url protocols. If you have a
734+ URL that starts with ``'https'`` you might try removing the ``'s'``.
726735
727- match : str or regex, optional, default '.+'
736+ match : str or compiled regular expression, optional
728737 The set of tables containing text matching this regex or string will be
729738 returned. Unless the HTML is extremely simple you will probably need to
730739 pass a non-empty string here. Defaults to '.+' (match any non-empty
731740 string). The default value will return all tables contained on a page.
732741 This value is converted to a regular expression so that there is
733742 consistent behavior between Beautiful Soup and lxml.
734743
735- flavor : str, container of strings, default ``None``
736- The parsing engine to use under the hood . 'bs4' and 'html5lib' are
737- synonymous with each other, they are both there for backwards
738- compatibility. The default of ``None`` tries to use ``lxml`` to parse
739- and if that fails it falls back on ``bs4`` + ``html5lib``.
744+ flavor : str or None , container of strings
745+ The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
746+ each other, they are both there for backwards compatibility. The
747+ default of ``None`` tries to use ``lxml`` to parse and if that fails it
748+ falls back on ``bs4`` + ``html5lib``.
740749
741- header : int or array -like, optional, default ``0``
742- The row (or rows for a MultiIndex) to use to make the columns headers.
743- Note that this row will be removed from the data .
750+ header : int or list -like or None, optional
751+ The row (or list of rows for a :class:`~pandas. MultiIndex` ) to use to
752+ make the columns headers .
744753
745- index_col : int or array-like or None, optional, default ``None``
746- The column to use to make the index. Note that this column will be
747- removed from the data.
754+ index_col : int or list-like or None, optional
755+ The column (or list of columns) to use to create the index.
748756
749- skiprows : int or collections.Container or slice or None, optional, default ``None``
757+ skiprows : int or list-like or slice or None, optional
750758 If an integer is given then skip this many rows after parsing the
751759 column header. If a sequence of integers is given skip those specific
752760 rows (0-based). Note that
753761
754762 .. code-block:: python
755763
756- skiprows == 0
764+ pandas.read_html(..., skiprows=0)
757765
758766 yields the same result as
759767
760768 .. code-block:: python
761769
762- skiprows is None
770+ pandas.read_html(..., skiprows= None)
763771
764772 If `skiprows` is a positive integer, say :math:`n`, then
765773 it is treated as "skip :math:`n` rows", *not* as "skip the
766774 :math:`n^\textrm{th}` row".
767775
768- infer_types : bool or None , optional, default ``None`` , deprecated since 0.13, removed in 0.14
776+ infer_types : bool, optional, deprecated since 0.13, removed in 0.14
769777
770- attrs : dict or None, optional, default ``None``
778+ attrs : dict or None, optional
771779 This is a dictionary of attributes that you can pass to use to identify
772780 the table in the HTML. These are not checked for validity before being
773781 passed to lxml or Beautiful Soup. However, these attributes must be
@@ -793,51 +801,65 @@ def read_html(io, match='.+', flavor=None, header=0, index_col=None,
793801 <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
794802 latest information on table attributes for the modern web.
795803
804+ parse_dates : bool, optional
805+ See :func:`~pandas.read_csv` for details.
806+
807+ tupleize_cols : bool, optional
808+ If ``False`` try to parse multiple header rows into a
809+ :class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
810+ details. Defaults to ``False`` for backwards compatibility. This is in
811+ contrast to other IO functions which default to ``True``.
812+
813+ thousands : str, optional
814+ Separator to use to parse thousands. Defaults to ``','``. Note that
815+ this differs from :func:`~pandas.read_csv`: ``read_csv`` must be able
816+ to accept ``','`` as a *column* separator, so it cannot assume a
817+ thousands separator, whereas :func:`~pandas.read_html` has no such
818+ ambiguity and can safely default to ``','``.
819+
796820 Returns
797821 -------
798822 dfs : list of DataFrames
799- A list of DataFrames, each of which is the parsed data from each of the
800- tables on the page.
801823
802824 Notes
803825 -----
804- Before using this function you should probably read the :ref:`gotchas about
805- the parser libraries that this function uses <html-gotchas>`.
806-
807- There's as little cleaning of the data as possible due to the heterogeneity
808- and general disorder of HTML on the web.
826+ Before using this function you should read the :ref:`gotchas about the
827+ HTML parsing libraries <html-gotchas>`.
809828
810- Expect some cleanup after you call this function. For example,
811- you might need to pass `infer_types=False` and perform manual conversion if
812- the column names are converted to NaN when you pass the `header=0`
813- argument. We try to assume as little as possible about the structure of the
814- table and push the idiosyncrasies of the HTML contained in the table to
815- you, the user.
829+ Expect to do some cleanup after you call this function. For example, you
830+ might need to manually assign column names if the column names are
831+ converted to NaN when you pass the `header=0` argument. We try to assume as
832+ little as possible about the structure of the table and push the
833+ idiosyncrasies of the HTML contained in the table to the user.
816834
817- This function only searches for <table> elements and only for <tr> and <th>
818- rows and <td> elements within those rows. This could be extended by
819- subclassing one of the parser classes contained in :mod:`pandas.io.html` .
835+ This function searches for `` <table>`` elements and only for `` <tr>``
836+ and ``<th>`` rows and `` <td>`` elements within each ``<tr>`` or ``<th>``
837+ element in the table. ``<td>`` stands for "table data" .
820838
821- Similar to :func:`read_csv` the `header` argument is applied **after**
822- `skiprows` is applied.
839+ Similar to :func:`~pandas. read_csv` the `header` argument is applied
840+ **after** `skiprows` is applied.
823841
824842 This function will *always* return a list of :class:`DataFrame` *or*
825843 it will fail, e.g., it will *not* return an empty list.
826844
827845 Examples
828846 --------
829847 See the :ref:`read_html documentation in the IO section of the docs
830- <io.read_html>` for many examples of reading HTML.
848+ <io.read_html>` for some examples of reading in HTML tables.
849+
850+ See Also
851+ --------
852+ pandas.read_csv
831853 """
832- # Type check here. We don't want to parse only to fail because of an
833- # invalid value of an integer skiprows.
834854 if infer_types is not None :
835- warnings .warn ("infer_types will be removed in 0.14" , UserWarning )
855+ warnings .warn ("infer_types will be removed in 0.14" )
836856 else :
837- infer_types = True # remove in 0.14
857+ infer_types = True # TODO: remove in 0.14
838858
859+ # Type check here. We don't want to parse only to fail because of an
860+ # invalid value of an integer skiprows.
839861 if isinstance (skiprows , numbers .Integral ) and skiprows < 0 :
840862 raise AssertionError ('cannot skip rows starting from the end of the '
841863 'data (you passed a negative value)' )
842864 return _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
843- parse_dates , attrs )
865+ parse_dates , tupleize_cols , thousands , attrs )
0 commit comments