Source code for dhtmlparser3.parser

import gc

from dhtmlparser3.tokens import TextToken
from dhtmlparser3.tokens import CommentToken
from dhtmlparser3.tokenizer import Tokenizer
from dhtmlparser3.specialdict import SpecialDict

from dhtmlparser3.tags.tag import Tag
from dhtmlparser3.tags.comment import Comment


[docs]class Parser: NONPAIR_TAGS = { "br", "hr", "img", "input", # "link", "meta", "spacer", "frame", "base", } def __init__(self, string: str, case_insensitive_parameters=True): # remove UTF BOM (prettify fails if not) if len(string) > 3 and string[:3] == "\xef\xbb\xbf": string = string[3:] if case_insensitive_parameters: Tag._DICT_INSTANCE = SpecialDict else: Tag._DICT_INSTANCE = dict self.tokenizer = Tokenizer(string)
[docs] def parse_dom(self) -> Tag: gc.disable() root_elem = Tag("") top_element = root_elem element_stack = [root_elem] for token in self.tokenizer.tokenize_iter(): if isinstance(token, TextToken): top_element.content.append(token.content) continue elif isinstance(token, CommentToken): top_element.content.append(Comment(token.content)) continue elif token.is_non_pair: tag = token.to_tag() tag.parent = top_element top_element.content.append(tag) continue elif token.is_end_tag: closed_element = [ x for x in reversed(element_stack) if x.name == token.name ] # random closing tag which doesn't match anything if not closed_element: continue closed_element = closed_element[0] # correctly closed element on top of the stack if closed_element is top_element: element_stack.pop() top_element = element_stack[-1] continue top_element = self._reshape_non_pair_tags(element_stack, closed_element) continue new_top_element = token.to_tag() top_element.content.append(new_top_element) new_top_element.parent = top_element element_stack.append(new_top_element) top_element = new_top_element if len(element_stack) > 1: self._reshape_non_pair_tags(element_stack, root_elem) gc.enable() if len(root_elem.content) == 1 and isinstance(root_elem.content[0], Tag): return root_elem.content[0] return root_elem
def _reshape_non_pair_tags(self, element_stack, closed_element): """ Used for non_pair tags, which are parsed like this: "<div><br><img><hr></div>" gets parsed to: <div> <br> <img> <hr> What this does is that it changes the structure into: <div> <br> <img> <hr> """ # find which one was closed and treat all others as nonpair closed_element_index = element_stack.index(closed_element) + 1 non_pairs = element_stack[closed_element_index:] new_element_stack = element_stack[:closed_element_index] element_stack.clear() element_stack.extend(new_element_stack) # create list of (element, parent) from the non_pairs shifted_non_pairs = non_pairs[:] shifted_non_pairs.pop() shifted_non_pairs.insert(0, element_stack[-1]) for npt, parent in reversed(list(zip(non_pairs, shifted_non_pairs))): self._move_content_to_parent(npt, parent) npt.is_non_pair = True npt.parent = closed_element if element_stack: element_stack.pop() if element_stack: return element_stack[-1] return closed_element def _move_content_to_parent(self, non_pair_tag: Tag, parent: Tag): """ Take `.content` from `non_pair_tag` and move them to `parent` tag. """ if not non_pair_tag.content: return try: npt_index_in_parent = parent.content.index(non_pair_tag) except ValueError: npt_index_in_parent = 0 for sub_tag in reversed(non_pair_tag.content): parent.content.insert(npt_index_in_parent + 1, sub_tag) non_pair_tag.content.clear()