1- from html .parser import HTMLParser as _HTMLParser
2- from typing import Any , Callable , Dict , Generic , List , Optional , Tuple , TypeVar
1+ from itertools import chain
2+ from typing import Any , Callable , Generic , Iterable , List , TypeVar , Union
3+
4+ from lxml import etree
5+ from lxml .html import fragments_fromstring
6+
7+ import idom
8+ from idom .core .types import VdomDict
39
410
511_RefValue = TypeVar ("_RefValue" )
12+ _ModelTransform = Callable [[VdomDict ], Any ]
613_UNDEFINED : Any = object ()
714
815
@@ -49,11 +56,9 @@ def __repr__(self) -> str:
4956 return f"{ type (self ).__name__ } ({ current } )"
5057
5158
52- _ModelTransform = Callable [[Dict [str , Any ]], Any ]
53-
54-
def html_to_vdom(html: str, *transforms: _ModelTransform, strict: bool = True) -> VdomDict:
    """Convert a string of HTML into a VDOM dictionary.

    An element can be given a unique key by adding a ``key=...`` attribute to its
    HTML tag.

    Parameters:
        html:
            The HTML markup to convert.
        transforms:
            Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
            dictionary which will be replaced by ``new``. For example, you could use a
            transform function to add highlighting to a ``<code/>`` block.
        strict:
            If ``True``, the parser is created with error recovery disabled and a
            syntax error in the HTML is reported as an :class:`HTMLParseError`.

    Returns:
        The VDOM for the single top-level element, or, when the markup does not
        contain exactly one top-level element, a VDOM whose ``tagName`` is the
        empty string and which wraps every parsed element.

    Raises:
        TypeError: If ``html`` is not a string.
        HTMLParseError: If ``strict`` is true and lxml reports a syntax error.
    """
    if not isinstance(html, str):  # pragma: no cover
        raise TypeError(f"Expected html to be a string, not {type(html).__name__} ")

    # Comments and processing instructions are dropped by the parser itself;
    # recovery from malformed markup is only enabled in non-strict mode.
    html_parser = etree.HTMLParser(
        remove_comments=True,
        remove_pis=True,
        remove_blank_text=True,
        recover=not strict,
    )

    try:
        fragments: List = fragments_fromstring(
            html, no_leading_text=True, parser=html_parser
        )
    except etree.XMLSyntaxError as error:
        if not strict:
            raise error  # pragma: no cover
        raise HTMLParseError(
            "An error has occurred while parsing the HTML.\n \n "
            "This HTML may be malformatted, or may not perfectly adhere to HTML5.\n "
            "If you believe the exception above was due to something intentional, "
            "you can disable the strict parameter on html_to_vdom().\n "
            "Otherwise, repair your broken HTML and try again."
        ) from error

    # Exactly one top-level element: it is the root, nothing else to do.
    if len(fragments) == 1:
        return _etree_to_vdom(fragments[0], transforms)

    # Otherwise gather the fragments under a placeholder element. lxml needs a
    # non-empty tag name, so a throwaway one is used and blanked out afterwards.
    wrapper = etree.Element("TEMP", None, None)
    for fragment in fragments:
        wrapper.append(fragment)

    vdom = _etree_to_vdom(wrapper, transforms)

    # An empty tag name turns the placeholder root into a React Fragment.
    vdom["tagName"] = ""
    return vdom
115+
116+
def _etree_to_vdom(
    node: etree._Element, transforms: Iterable[_ModelTransform]
) -> VdomDict:
    """Recursively transform an lxml etree node into a DOM model.

    Parameters:
        node:
            The ``lxml.etree._Element`` node to convert.
        transforms:
            Functions of the form ``transform(old) -> new`` where ``old`` is a VDOM
            dictionary which will be replaced by ``new``. They are applied, in
            order, to the VDOM built for ``node`` after its children have been
            converted.

    Returns:
        The result of passing the VDOM built for ``node`` through every transform.

    Raises:
        TypeError: If ``node`` is not an ``etree._Element``.
    """
    if not isinstance(node, etree._Element):  # pragma: no cover
        raise TypeError(
            f"Expected node to be a etree._Element, not {type(node).__name__} "
        )

    # Children first: this is where the recursion into _etree_to_vdom() happens.
    child_models = _generate_vdom_children(node, transforms)

    # The ``key`` attribute is lifted out of the regular attributes.
    attrs = dict(node.items())
    key = attrs.pop("key", None)

    vdom: VdomDict
    if not hasattr(idom.html, node.tag):
        # No constructor with this tag name in idom.html: assemble the
        # dictionary by hand, leaving out anything that is empty.
        vdom = {"tagName": node.tag}
        if child_models:
            vdom["children"] = child_models
        if attrs:
            vdom["attributes"] = attrs
        if key is not None:
            vdom["key"] = key
    else:
        constructor = getattr(idom.html, node.tag)
        vdom = constructor(attrs, *child_models, key=key)

    # Bring the attributes in line with the VDOM spec (e.g. ``style`` as a dict).
    _mutate_vdom(vdom)

    # Each transform receives the output of the previous one.
    for apply_transform in transforms:
        vdom = apply_transform(vdom)

    return vdom
161+
162+
def _mutate_vdom(vdom: VdomDict):
    """Adjust the attributes of ``vdom`` in place so that they meet the VDOM spec.

    The only adjustment made at present is to the ``style`` attribute: when it is
    a string it is replaced by a dictionary whose keys are camelCase, so that it
    can be rendered by React.

    More adjustments may be added here in the future.
    """
    if "attributes" not in vdom:
        return

    attributes = vdom["attributes"]
    if "style" not in attributes:
        return

    style = attributes["style"]
    if not isinstance(style, str):
        # Already something other than a CSS string; leave it alone.
        return

    # Convince type checker that it's safe to mutate attributes
    assert isinstance(attributes, dict)

    # Split "name: value; name: value" into a dict with camelCase names.
    # Pieces without a colon (such as the empty one after a trailing ";")
    # are skipped.
    converted = {}
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        name, value = declaration.split(":", 1)
        converted[_hypen_to_camel_case(name.strip())] = value.strip()

    attributes["style"] = converted
189+
190+
def _generate_vdom_children(
    node: etree._Element, transforms: Iterable[_ModelTransform]
) -> List[Union[VdomDict, str]]:
    """Build the list of VDOM children for an lxml node.

    The node's own inner text (if any) comes first. Each child element is then
    converted to VDOM and followed by its tail text (if any), so text and
    elements keep the order they had in the markup.
    """
    children: List[Union[VdomDict, str]] = []

    # Text that appears before the first child element.
    if node.text:
        children.append(node.text)

    for element in node.iterchildren(None):
        # Recursively convert the child element itself...
        children.append(_etree_to_vdom(element, transforms))
        # ...then the text sitting between it and the next sibling.
        if element.tail:
            children.append(element.tail)

    return children
211+
212+
213+ def _hypen_to_camel_case (string : str ) -> str :
214+ """Convert a hypenated string to camelCase."""
215+ first , _ , remainder = string .partition ("-" )
216+ return first .lower () + remainder .title ().replace ("-" , "" )
217+
218+
class HTMLParseError(etree.LxmlSyntaxError):
    """Signals that strict-mode parsing could not process the given HTML."""
0 commit comments