# Source code for dhtmlparser3.tags.tag

import html
import copy
from typing import Dict
from typing import List
from typing import Union
from typing import Iterator

from dhtmlparser3.quoter import escape
from dhtmlparser3.specialdict import SpecialDict
from dhtmlparser3.tags.comment import Comment


class Tag:
    """
    One element of the parsed document tree.

    Attributes:
        name (str): Name of the parsed tag. Empty string is used for
            nameless container elements.
        parameters (SpecialDict): Dictionary for the parameters.
        content (list): List of sub-elements.
        is_non_pair (bool): True when the tag has no closing counterpart.
        parent (Tag): Reference to parent element.
    """

    # mapping type used to store the tag parameters
    _DICT_INSTANCE = SpecialDict
    # tags whose text content is serialized without HTML escaping
    _DONT_ESCAPE = {"style", "script"}
    # tags whose content .prettify() emits without re-indentation
    _DONT_FORMAT = {"pre", "style", "script"}

    def __init__(self, name, parameters=None, content=None, is_non_pair=False):
        self.name = name

        if parameters is None:
            self.parameters = self._DICT_INSTANCE()
        elif isinstance(parameters, dict):
            self.parameters = self._DICT_INSTANCE(parameters)
        else:
            self.parameters = parameters

        self.content = content if content is not None else []

        self.is_non_pair = is_non_pair
        self.parent = None

        self._wfind_only_on_content = False

    @property
    def p(self) -> Dict[str, str]:
        """
        Shortcut for .parameters, used extensively in tests.
        """
        return self.parameters

    @property
    def c(self):
        """
        Shortcut for .content, used extensively in tests.
        """
        return self.content

    @property
    def tags(self) -> List["Tag"]:
        """
        Same as .c, but returns only tag instances. Useful for ignoring
        whitespace and comment clutter and iterating over the real dom structure.
        """
        return [x for x in self.content if isinstance(x, Tag)]

[docs]    def double_link(self):
        """
        Make the DOM hierarchy double-linked. Each content element now points
        to the parent element.
        """
        for item in self.content:
            if isinstance(item, Tag):
                item.parent = self
                item.double_link()

[docs]    def content_without_tags(self) -> str:
        """
        Return content but remove all tags.

        This is sometimes useful for processing messy websites.
        """
        output = ""
        for item in self.content:
            if isinstance(item, Tag):
                output += item.content_without_tags()
            elif isinstance(item, str):
                output += item

        return output

[docs]    def remove(self, offending_item: Union[str, "Tag", Comment]) -> bool:
        """
        Remove `offending_item` anywhere from the dom.

        Item is matched using `is` operator, so it better be something you've
        found using .find() or other relevant methods.

        Returns:
            bool: True if the item was found and removed.
        """
        for item in self.content:
            if item is offending_item:
                self.remove_item(offending_item)
                return True

            if isinstance(item, Tag) and item.remove(offending_item):
                return True

        return False

[docs]    def remove_item(self, item: Union[str, "Tag", Comment]):
        """
        Remove the item from the .content property.
        """
        if isinstance(item, str):
            self.content.remove(item)
        elif isinstance(item, Comment):
            self.content = [
                x for x in self.content if not (isinstance(x, Comment) and x is item)
            ]
        elif isinstance(item, Tag):
            self.content = [
                x for x in self.content if not (isinstance(x, Tag) and x is item)
            ]
        else:
            raise ValueError(f"Can't remove `{repr(item)}`")

[docs]    def to_string(self) -> str:
        """
        Get HTML representation of the tag and the content.
        """
        output = self.tag_to_str()

        escape_fn = html.escape
        if self.name in self._DONT_ESCAPE:
            escape_fn = lambda x: x

        for item in self.content:
            if isinstance(item, str):
                output += escape_fn(item)
            else:
                output += item.to_string()

        if self.name and not self.is_non_pair:
            return f"{output}</{self.name}>"

        return output

[docs]    def tag_to_str(self) -> str:
        """
        Convert just the tag with parameters to string, without content.
        """
        if not self.name:
            return ""

        if self.is_non_pair:
            return f"<{self.name}{self._parameters_to_str()} />"

        return f"<{self.name}{self._parameters_to_str()}>"

    def _parameters_to_str(self) -> str:
        if not self.parameters:
            return ""

        parameters = []
        for key, value in self.parameters.items():
            if value:
                parameters.append(f'{key}="{escape(str(value))}"')
            else:
                parameters.append(f"{key}")

        return " " + " ".join(parameters)

[docs]    def content_str(self, escape=False) -> str:
        """
        Return everything in between the tags as string.

        Args:
            escape (bool): Escape the content. Default False.
        """
        output = ""
        for item in self.content:
            if isinstance(item, str):
                if escape:
                    output += html.escape(item)
                else:
                    output += item
            else:
                output += item.to_string()

        return output

[docs]    def replace_with(self, item: "Tag", keep_content: bool = False):
        """
        Replace this Tag with another `item`.

        Args:
            item (Tag, str): Item to replace this with.
            keep_content (bool): Keep the original content. Default `False`.
        """
        if isinstance(item, str):
            unused_root_element = (
                self.parent.name == "" and len(self.parent.content) == 1
            )
            if self.parent and not unused_root_element:
                self_index = self.parent.content.index(self)
                self.parent.content[self_index] = item
            else:
                self.name = ""
                self.parameters.clear()
                self.is_non_pair = True
                self.content = [item]
        elif isinstance(item, Tag):
            self.name = item.name
            self.parameters = item.parameters.copy()
            if not keep_content:
                self.content = item.content[:]
            self.is_non_pair = item.is_non_pair
            self._wfind_only_on_content = item._wfind_only_on_content
        else:
            raise TypeError(f"Can't replace `item` with `{item.__class__}`!")

[docs]    def wfind(self, name, p=None, fn=None, case_sensitive=False):
        container = Tag(name="")
        container._wfind_only_on_content = True

        # in the first iteration, just do regular find
        if not self._wfind_only_on_content:
            container.content = self.find(name, p, fn, case_sensitive)
            return container

        # in the subsequent iterations, perform the matching on the sub-tags
        sub_tags = (item.content for item in self.content)
        for item in sum(sub_tags, []):  # flattern the list
            if isinstance(item, Tag):
                if item._is_almost_equal(name, p, fn, case_sensitive):
                    container.content.append(item)

        return container

[docs]    def match(self, *args):
        """
        Recursively call `find` for each element in `*args`. That means fuzzy
        matching, like "find all `<div>`s, which have this `<p>` element, which
        has this `<a>` in it.

        Example:
            dom.match("div", ["p", {"class": "great"}], "a")

        Args:
            *args (list): List of paths to match.

        Returns:
            list: List of matched elements.
        """
        item = self
        args = list(args)

        arg = args.pop(0)
        matched = self._call_find(arg)

        if not args:
            return matched

        next_matched = []
        while args:
            arg = args.pop(0)
            for item in matched:
                next_matched.extend(item._call_find(arg))

            matched = next_matched
            next_matched = []

        return matched

    def _call_find(self, arg):
        if isinstance(arg, dict):
            return self.find(**arg)
        elif isinstance(arg, list) or isinstance(arg, tuple):
            return self.find(*arg)
        else:
            return self.find(arg)

[docs]    def match_paths(self, *args):
        """
        Exactly match the path given by the arguments.

        Example:
            dom.match("body", ["div", {"class": "page-body"}], "p")

        This will match the path only if it really goes like this. If the `<p>`
        is for example wrapped in <div>, it won't be matched.

        Args:
            *args (list): List of paths to match.

        Returns:
            list: List of matched elements.
        """
        item = self
        args = list(args)
        while args:
            arg = args.pop(0)
            item = item._call_wfind(arg)

        return item.content

    def _call_wfind(self, arg):
        if isinstance(arg, dict):
            return self.wfind(**arg)
        elif isinstance(arg, list) or isinstance(arg, tuple):
            return self.wfind(*arg)
        else:
            return self.wfind(arg)

[docs]    def find(self, name, p=None, fn=None, case_sensitive=False) -> List["Tag"]:
        """
        Find (depth first) all tags with given parameters.

        Args:
            name (str): Name of the tag you are looking for. Use `""` for all.
            p (dict): Parameters to match.
            fn (lambda fn): Lambda expecting one argument.
             It will be tested for each element in the tree.
            case_sensitive (bool): Use case sensitive search. Default `False`.
        """
        return list(self.find_depth_first_iter(name, p, fn, case_sensitive))

[docs]    def findb(self, name, p=None, fn=None, case_sensitive=False) -> List["Tag"]:
        """
        Find (breadth first) all tags with given parameters.

        Args:
            name (str): Name of the tag you are looking for. Use `""` for all.
            p (dict): Parameters to match.
            fn (lambda fn): Lambda expecting one argument.
             It will be tested for each element in the tree.
            case_sensitive (bool): Use case sensitive search. Default `False`.
        """
        return list(self.find_breadth_first_iter(name, p, fn, case_sensitive))

[docs]    def find_depth_first_iter(
        self, name, p=None, fn=None, case_sensitive=False
    ) -> Iterator["Tag"]:
        for item in self.depth_first_iterator(tags_only=True):
            if item._is_almost_equal(name, p, fn, case_sensitive):
                yield item

[docs]    def find_breadth_first_iter(
        self, name, p=None, fn=None, case_sensitive=False
    ) -> Iterator["Tag"]:
        for item in self.breadth_first_iterator(tags_only=True):
            if item._is_almost_equal(name, p, fn, case_sensitive):
                yield item

[docs]    def depth_first_iterator(
        self, tags_only=False
    ) -> Iterator[Union["Tag", str, Comment]]:
        yield self

        for item in self.content:
            if isinstance(item, Tag):
                yield from item.depth_first_iterator(tags_only)
            elif not tags_only:
                yield item

[docs]    def breadth_first_iterator(
        self, tags_only=False, _first_call=True
    ) -> Iterator[Union["Tag", str, Comment]]:
        if _first_call:
            yield self

        if tags_only:
            for item in self.content:
                if isinstance(item, Tag):
                    yield item
        else:
            yield from self.content

        for item in self.content:
            if isinstance(item, Tag):
                yield from item.breadth_first_iterator(tags_only, False)

    def _is_almost_equal(
        self, other_name: str, p: dict = None, fn=None, case_sensitive=False
    ) -> bool:
        tag_name = self.name
        if not case_sensitive:
            tag_name = tag_name.lower()
            other_name = other_name.lower()

        if other_name and tag_name != other_name:
            return False

        if p is not None and not self._contains_parameters_subset(p):
            return False

        if fn is not None and not fn(self):
            return False

        return True

    def _contains_parameters_subset(self, parameter_subset):
        """
        Test whether this Tag contains at least all `parameter_subset`, key
        and values, or more.

        Args:
            params (dict/SpecialDict): Subset of parameters.

        Returns:
            bool: True if it is contained.
        """
        for key, val in parameter_subset.items():
            if not self.parameters or key not in self.parameters:
                return False

            if val != self.parameters[key]:
                return False

        return True

[docs]    def prettify(self, depth=0, dont_format=False) -> str:
        if self.name == "":
            return self._just_prettify_the_content()

        tag = self.tag_to_str()
        indent = depth * "  "

        if self.is_non_pair and not self.content:
            return f"{indent}{tag}\n"

        end_tag = "" if self.is_non_pair else f"</{self.name}>"

        if not dont_format and self.name in self._DONT_FORMAT:
            dont_format = True

        escape_fn = html.escape
        if self.name in self._DONT_ESCAPE:
            escape_fn = lambda x: x

        content = ""
        for item in self.content:
            if isinstance(item, str):
                if dont_format or item.strip():
                    content += escape_fn(item)
            else:
                content += item.prettify(depth + 1, dont_format=dont_format)

        if dont_format:
            return f"{tag}{content}{end_tag}\n"

        is_multiline = sum(1 for x in content.strip() if x == "\n") > 1
        if is_multiline:
            if content.endswith("\n"):
                return f"{indent}{tag}\n{content}{indent}{end_tag}\n"

            return f"{indent}{tag}\n{content}\n{indent}{end_tag}\n"

        if content.startswith("  ") and content.endswith("\n"):
            return f"{indent}{tag}\n{content}{indent}{end_tag}\n"

        return f"{indent}{tag}{content}{end_tag}\n"

    def _just_prettify_the_content(self):
        outputs = []

        escape_fn = html.escape
        if self.name in self._DONT_ESCAPE:
            escape_fn = lambda x: x

        for item in self.content:
            if isinstance(item, str):
                if item.strip():
                    outputs.append(escape_fn(item))
            else:
                outputs.append(item.prettify(0))

        return "\n".join(outputs)

    def __str__(self) -> str:
        return self.to_string()

    def __bytes__(self) -> bytes:
        return self.to_string().encode("utf-8")

    def __repr__(self) -> str:
        parameters = (
            f"{repr(self.name)}",
            f"parameters={repr(self.parameters)}",
            f"is_non_pair={self.is_non_pair}",
        )
        if self._wfind_only_on_content:
            parameters = ('name=""', f"content={repr(self.content)}")

        return f"{self.__class__.__name__}({', '.join(parameters)})"

    def __eq__(self, other):
        if not isinstance(other, Tag):
            return False

        if self.name != other.name:
            return False

        if self.parameters != other.parameters:
            return False

        if self.is_non_pair != other.is_non_pair:
            return False

        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        rolling_hash = hash(self.tag_to_str())

        for item in self.content:
            rolling_hash ^= hash(item)

        return rolling_hash

    def __bool__(self):
        return bool(self.content)

    def __len__(self):
        return len(self.tags)

    def __getitem__(self, item):
        if isinstance(item, str):
            return self.parameters[item]
        else:
            return self.tags[item]

    def __setitem__(self, key, value):
        if isinstance(key, str):
            self.parameters[key] = str(value)
        elif isinstance(key, slice):  # used for inserting
            if key.start == -1:
                self.content.append(value)
            elif key.start == 0:
                self.content.insert(0, value)
            else:
                # use .tags as reference
                item = self.tags[key.start]
                index = self.content.index(item)
                self.content.insert(index, value)
        else:
            item = self.tags[key]
            index = self.content.index(item)
            self.content[index] = value

        if isinstance(value, Tag):
            value.parent = self

    def __contains__(self, item):
        if isinstance(item, str):
            return item in self.parameters
        else:
            return item in self.content

    def __delitem__(self, key):
        if isinstance(key, str):
            del self.parameters[key]
        else:
            self.remove_item(self.tags[key])

    def __iter__(self):
        return iter(self.tags)

    def __copy__(self):
        new_tag = Tag(self.name, self.parameters.copy(), self.content, self.is_non_pair)
        new_tag._wfind_only_on_content = self._wfind_only_on_content
        new_tag.parent = self.parent

        return new_tag

    def __deepcopy__(self, memodict={}):
        new_tag = Tag(self.name, self.parameters.copy(), is_non_pair=self.is_non_pair)
        new_tag._wfind_only_on_content = self._wfind_only_on_content

        new_tag.content = [copy.deepcopy(x, memodict) for x in self.content]
        for item in new_tag.content:
            if isinstance(item, Tag):
                item.parent = new_tag

        return new_tag