Source code for dhtmlparser3.tokenizer

from typing import List
from typing import Iterator

from dhtmlparser3.tokens import Token
from dhtmlparser3.tokens import TagToken
from dhtmlparser3.tokens import TextToken
from dhtmlparser3.tokens import EntityToken
from dhtmlparser3.tokens import CommentToken
from dhtmlparser3.tokens import ParameterToken


class Tokenizer:
    """Character-level tokenizer for HTML-like markup.

    The tokenizer walks ``string`` one character at a time and produces
    ``TagToken``, ``CommentToken`` and ``TextToken`` objects.  Entities are
    scanned as ``EntityToken`` but are converted to text with ``to_text()``
    before they leave :meth:`tokenize_iter`.  Markup that cannot be parsed
    as a tag is emitted as plain text instead of raising.

    Scanning state (``pointer``, ``char``, ``buffer``) is only initialised
    in ``__init__``, so one instance is meant to tokenize its string once.
    """

    # Declared for type checkers only; nothing in this class assigns it.
    tokens: List["Token"]

    # Longest run of characters after ``&`` still considered an entity.
    MAX_ENTITY_LENGTH = 20

    # Whitespace as defined by HTML: space, tab, LF, CR and form feed.
    _WHITESPACE = " \t\n\r\f"
    # Characters that terminate a tag name / parameter name / unquoted value.
    _TAG_NAME_DELIMITERS = "></" + _WHITESPACE
    _PARAMETER_NAME_DELIMITERS = "<=/>" + _WHITESPACE
    _PARAMETER_VALUE_DELIMITERS = "</>'\"" + _WHITESPACE

    def __init__(self, string: str):
        """Prepare the tokenizer for scanning ``string``.

        Args:
            string: Markup to tokenize.
        """
        self.string = string
        self.pointer = 0  # index of the current character
        self.buffer = ""  # characters collected for the token in progress
        self.char = string[0] if string else ""  # current character
        self.end = len(string) - 1  # index of the last character

    def tokenize(self) -> List["Token"]:
        """Return all tokens of the input as a list."""
        return list(self.tokenize_iter())

    def tokenize_iter(self) -> Iterator["Token"]:
        """Lazily yield the tokens of the input.

        Entities are converted to ``TextToken`` and adjacent text tokens are
        merged, so the stream never contains two ``TextToken`` in a row.
        An empty input yields a single empty ``TextToken``.
        """
        if self.end == 0:
            # A single character cannot form a tag, comment or entity.
            yield TextToken(self.string)
            return

        # ``pending`` is held back so that following text can be merged in.
        pending = self._scan_text_or_token()
        while not self.is_at_end():
            token = self._scan_text_or_token()

            if isinstance(pending, TextToken) and isinstance(token, TextToken):
                pending.content += token.content
                continue

            yield pending
            pending = token

        yield pending

    def _scan_text_or_token(self):
        """Scan one token, converting an ``EntityToken`` to ``TextToken``."""
        token = self._scan_token()
        if isinstance(token, EntityToken):
            return TextToken(token.to_text())
        return token

    def _scan_token(self):
        """Scan one raw token starting at the current character."""
        if self.char == "<":
            start = self.pointer
            try:
                return self._consume_tag()
            except IOError:
                # Not a well-formed tag: emit everything consumed so far as
                # text and drop whatever was left in the buffer.
                self.buffer = ""
                return TextToken(self.string[start:self.pointer])

        if self.char == "&":
            return self._consume_entity()

        return self._consume_text()

    def _consume_tag(self):
        """Consume a tag or comment; the current character is ``<``.

        Returns:
            ``TagToken`` for a tag, the result of :meth:`_consume_comment`
            for ``<!--``, or ``TextToken`` for an empty tag such as ``<>``.

        Raises:
            IOError: If a new ``<`` appears inside the tag or the input ends
                before the closing ``>``.
        """
        start = self.pointer
        self.advance()  # consume <
        self._consume_whitespaces()

        is_end_tag = False
        if self.char == "/":
            is_end_tag = True
            self.advance()

        if self.char == ">":
            self.advance()  # consume >
            # An empty tag ("<>", "</>", "< >") is plain text.  Return the
            # original characters so that nothing from the input is lost.
            return TextToken(self.string[start:self.pointer])

        if self.char == "!" and self.peek_is("-") and self.peek_two_is("-"):
            return self._consume_comment()

        tag = TagToken(self._consume_tag_name(), is_end_tag=is_end_tag)

        while not self.is_at_end():
            self._consume_whitespaces()

            if self.char == ">":
                self.advance()  # consume >
                return tag
            if self.char == "<":
                raise IOError("New tag start.")

            parameter_name = self._consume_parameter_name()
            self._consume_whitespaces()

            if self.char == "/":
                self.advance()
                if parameter_name:
                    tag.parameters.append(ParameterToken(parameter_name))
                tag.is_non_pair = True
            elif self.char == ">":
                # Valueless parameter right before the end of the tag; the
                # ">" itself is handled by the next iteration.
                tag.parameters.append(ParameterToken(parameter_name))
            elif self.char == "=":
                self.advance()
                self._consume_whitespaces()
                parameter_value = self._consume_parameter_value()
                tag.parameters.append(
                    ParameterToken(parameter_name, parameter_value)
                )
            elif parameter_name:
                # Valueless parameter followed by another parameter, e.g.
                # "disabled" in <input disabled type="text">.  Without this
                # branch the name would be silently dropped.
                tag.parameters.append(ParameterToken(parameter_name))

        raise IOError("End of string while parsing tag!")

    def _consume_whitespaces(self):
        """Skip whitespace; stops on the first non-whitespace character."""
        while not self.is_at_end() and self.char in self._WHITESPACE:
            self.advance()

    def _consume_tag_name(self):
        """Consume and return the tag name starting at the current character.

        Raises:
            IOError: If the input is already exhausted.
        """
        self.buffer = self.char
        while not self.is_at_end():
            # peek() returns "" at the end of the input and "" is contained
            # in every string, so the end of input terminates the name too.
            if self.peek() in self._TAG_NAME_DELIMITERS:
                self.advance()  # move to the delimiter
                return self.return_reset_buffer()
            self.buffer += self.advance()

        raise IOError("End of string while parsing tag name!")

    def _consume_parameter_name(self):
        """Consume and return a parameter name.

        Returns:
            The name, or ``None`` when the current character is ``/``.

        Raises:
            IOError: If the input is already exhausted.
        """
        if self.char == "/":
            return None

        self.buffer = self.char
        while not self.is_at_end():
            # As in _consume_tag_name, "" (end of input) counts as delimiter.
            if self.peek() in self._PARAMETER_NAME_DELIMITERS:
                self.advance()  # move to the delimiter
                return self.return_reset_buffer()
            self.buffer += self.advance()

        raise IOError("End of string while paring parameter name!")

    def _consume_parameter_value(self):
        """Consume and return a parameter value (quoted or unquoted).

        Raises:
            IOError: If the input ends before the value is complete.
        """
        if self.char == '"' or self.char == "'":
            return self._consume_quoted_parameter_value()

        if self.char == ">":
            # "<a b=>": the value is empty.  Leave ">" in place so that the
            # caller can close the tag instead of swallowing it as a value.
            return ""

        self.buffer = self.char
        while not self.is_at_end():
            upcoming = self.peek()
            if upcoming in self._PARAMETER_VALUE_DELIMITERS:
                if upcoming == "'" or upcoming == '"':
                    self.advance()  # skip a stray quote ending the value
                self.advance()
                return self.return_reset_buffer()
            self.buffer += self.advance()

        raise IOError("End of string while parsing parameter value!")

    def _consume_quoted_parameter_value(self):
        """Consume a quoted value; the current character is the quote.

        ``&`` inside the value is scanned by :meth:`_consume_entity` and the
        ``to_text()`` of the resulting token is appended to the value.

        Raises:
            IOError: If the input ends before the closing quote.
        """
        quote_type = self.char
        self.advance()  # consume the opening quote

        if self.char == quote_type:
            self.advance()
            return ""

        while not self.is_at_end():
            if self.char == quote_type:
                self.advance()  # consume the closing quote
                return self.return_reset_buffer()

            if self.char == "&":
                # _consume_entity() overwrites self.buffer, so keep the value
                # collected so far in a local and restore it afterwards.
                collected = self.buffer
                collected += self._consume_entity().to_text()
                self.buffer = collected
                continue

            self.buffer += self.char
            self.advance()

        raise IOError("End of string while parsing parameter value!")

    def _consume_comment(self):
        """Consume ``<!-- ... -->``; the current character is ``!``.

        Returns:
            ``CommentToken`` with the text between the markers, or a
            ``TextToken`` with the raw text if the comment is never closed.
        """
        self.advance()  # consume !
        self.advance()  # consume -
        self.buffer = ""

        while not self.is_at_end():
            char = self.advance()
            if char == "-" and self.peek_is("-") and self.peek_two_is(">"):
                # Three steps move the pointer past the closing "-->".
                self.advance()
                self.advance()
                self.advance()
                return CommentToken(self.return_reset_buffer())
            self.buffer += char

        return TextToken(f"<!--{self.return_reset_buffer()}")

    def _consume_entity(self):
        """Consume an entity candidate; the current character is ``&``.

        Returns:
            ``EntityToken`` for ``&...;`` of at most ``MAX_ENTITY_LENGTH``
            characters after the ``&``, otherwise ``TextToken`` with the
            characters consumed so far.  The character that stopped the scan
            is left for the next token.
        """
        length = 0
        self.buffer = self.char
        while not self.is_at_end():
            char = self.advance()
            length += 1

            if length > self.MAX_ENTITY_LENGTH:
                return TextToken(self.return_reset_buffer())

            if char == ";":
                self.buffer += char
                # Always step over ";".  Previously "&;" left the pointer on
                # it, so the next scan emitted the ";" a second time.
                self.advance()
                entity = self.return_reset_buffer()
                if entity == "&;":
                    return TextToken(entity)
                return EntityToken(entity)

            if not (char.isalnum() or char == "#"):
                # Entities consist of letters, digits and "#" only.  Stopping
                # here keeps a bare "&" from swallowing a following tag,
                # quote or other markup as part of the "entity".
                return TextToken(self.return_reset_buffer())

            self.buffer += char

        return TextToken(self.return_reset_buffer())

    def _consume_text(self):
        """Consume plain text up to the next ``<``, ``&`` or end of input."""
        self.buffer += self.char
        while not self.is_at_end():
            char = self.advance()
            if char == "<" or char == "&":
                return TextToken(self.return_reset_buffer())
            self.buffer += char

        return TextToken(self.return_reset_buffer())

    def return_reset_buffer(self):
        """Return the buffer content and reset the buffer to ``""``."""
        content, self.buffer = self.buffer, ""
        return content

    def advance(self):
        """Move one character forward and return the new current character.

        Returns ``""`` once the pointer moved past the end of the input; in
        that case ``self.char`` keeps the last character of the string.
        """
        self.pointer += 1
        if self.pointer > self.end:
            return ""

        self.char = self.string[self.pointer]
        return self.char

    def is_at_end(self):
        """Return ``True`` once the pointer moved past the last character."""
        return self.pointer > self.end

    def peek_is(self, char):
        """Return ``True`` if the next character equals ``char``."""
        return self.peek() == char

    def peek(self):
        """Return the next character without moving, ``""`` if none."""
        if self.pointer < self.end:
            return self.string[self.pointer + 1]
        return ""

    def peek_two_is(self, char):
        """Return ``True`` if the character two ahead equals ``char``."""
        return self.peek_two() == char

    def peek_two(self):
        """Return the character two ahead without moving, ``""`` if none."""
        if (self.pointer + 1) < self.end:
            return self.string[self.pointer + 2]
        return ""