webleaf.Web

  1from .Leaf import Leaf
  2from .model.WebGraphAutoEncoder import WebGraphAutoEncoder
  3from lxml import etree
  4from lxml.cssselect import CSSSelector
  5
  6# Module-level cache for the WebGraphAutoEncoder instance; Web.__init__ creates it on first use and reuses it afterwards.
  7web_graph_auto_encoder = None
  8
  9
 10class Web:
 11    """
 12    The Web class provides an interface for parsing and interacting with HTML content.
 13    It uses the WebGraphAutoEncoder to convert the HTML structure into graph-based embeddings,
 14    allowing for extraction and comparison of elements.
 15
 16    Attributes:
 17    -----------
 18    html : str
 19        The raw HTML content passed during initialization.
 20    tree : lxml.etree.ElementTree
 21        Parsed HTML tree.
 22    features : list
 23        The encoded feature vectors for each HTML element.
 24    paths : list
 25        The XPath for each HTML element in the document.
 26    leaves : list of Leaf
 27        Leaf objects representing the elements of the HTML tree, based on their embeddings.
 28    path_leaves : dict
 29        A dictionary mapping XPaths to their corresponding Leaf objects.
 30    """
 31    def __init__(self, html: str):
 32        """
 33        Initialize the Web object by parsing the provided HTML and encoding it into embeddings.
 34
 35        Parameters:
 36        -----------
 37        html : str
 38            The HTML content to be parsed and encoded.
 39
 40        Raises:
 41        -------
 42        AssertionError if the HTML content is invalid.
 43        """
 44        global web_graph_auto_encoder
 45        self.html = html
 46        if not web_graph_auto_encoder:
 47            web_graph_auto_encoder = WebGraphAutoEncoder()
 48        self.tree = etree.ElementTree(etree.HTML(html))
 49        self.features, self.paths = web_graph_auto_encoder.extract(self.tree)
 50        self.leaves = [Leaf(feat) for feat in self.features]
 51        self.path_leaves = {path: leaf for path, leaf in zip(self.paths, self.leaves)}
 52
 53    def leaf(self, xpath: str = "", css_select: str = "") -> Leaf:
 54        """
 55        Extracts a specific element from the HTML tree based on an XPath or CSS selector
 56        and returns it as a Leaf object.
 57
 58        Parameters:
 59        -----------
 60        xpath : str, optional
 61            The XPath query to locate the element. (default is "")
 62        css_select : str, optional
 63            A CSS selector to locate the element. If provided, it is converted to XPath. (default is "")
 64
 65        Returns:
 66        --------
 67        Leaf
 68            The corresponding Leaf object for the element.
 69
 70        Raises:
 71        -------
 72        AssertionError if neither an XPath nor a CSS selector is provided, or if the element is not found.
 73        """
 74        if css_select:
 75            xpath = CSSSelector(css_select).path
 76        assert xpath, "When creating a WebLeaf please provide either a xpath or css selector."
 77        elements = self.tree.xpath(xpath)
 78        assert len(elements), f"Could not find elements at xpath [{xpath}] in html."
 79        path = self.tree.getpath(elements[0])
 80        assert path in self.path_leaves, f"The element at [{path}] was not processed by webleaf."
 81        return self.path_leaves[path]
 82
 83    def find(self, leaf: Leaf):
 84        """
 85         Finds the closest matching element in the HTML tree for the given Leaf object.
 86
 87         Parameters:
 88         -----------
 89         leaf : Leaf
 90             The Leaf object to be matched against the elements in the HTML tree.
 91
 92         Returns:
 93         --------
 94         str
 95             The XPath of the closest matching element.
 96         """
 97        return self.find_n(leaf, 1)[0]
 98
 99    def find_n(self, leaf: Leaf, n):
100        """
101        Finds the top N most similar elements to the given Leaf object,
102        based on their embeddings' distance.
103
104        Parameters:
105        -----------
106        leaf : Leaf
107            The Leaf object to compare against other elements in the tree.
108        n : int
109            The number of top similar elements to return.
110
111        Returns:
112        --------
113        list of str
114            A list of XPaths corresponding to the top N most similar elements.
115        """
116        path_dists = [(path, leaf.mdist(other)) for other, path in zip(self.leaves, self.paths)]
117        sorted_path_dists = sorted(path_dists, key=lambda path_dist: path_dist[1])
118        sorted_paths = [path for path, dist in sorted_path_dists]
119        return sorted_paths[:n]
web_graph_auto_encoder = None
class Web:
 11class Web:
 12    """
 13    The Web class provides an interface for parsing and interacting with HTML content.
 14    It uses the WebGraphAutoEncoder to convert the HTML structure into graph-based embeddings,
 15    allowing for extraction and comparison of elements.
 16
 17    Attributes:
 18    -----------
 19    html : str
 20        The raw HTML content passed during initialization.
 21    tree : lxml.etree.ElementTree
 22        Parsed HTML tree.
 23    features : list
 24        The encoded feature vectors for each HTML element.
 25    paths : list
 26        The XPath for each HTML element in the document.
 27    leaves : list of Leaf
 28        Leaf objects representing the elements of the HTML tree, based on their embeddings.
 29    path_leaves : dict
 30        A dictionary mapping XPaths to their corresponding Leaf objects.
 31    """
 32    def __init__(self, html: str):
 33        """
 34        Initialize the Web object by parsing the provided HTML and encoding it into embeddings.
 35
 36        Parameters:
 37        -----------
 38        html : str
 39            The HTML content to be parsed and encoded.
 40
 41        Raises:
 42        -------
 43        AssertionError if the HTML content is invalid.
 44        """
 45        global web_graph_auto_encoder
 46        self.html = html
 47        if not web_graph_auto_encoder:
 48            web_graph_auto_encoder = WebGraphAutoEncoder()
 49        self.tree = etree.ElementTree(etree.HTML(html))
 50        self.features, self.paths = web_graph_auto_encoder.extract(self.tree)
 51        self.leaves = [Leaf(feat) for feat in self.features]
 52        self.path_leaves = {path: leaf for path, leaf in zip(self.paths, self.leaves)}
 53
 54    def leaf(self, xpath: str = "", css_select: str = "") -> Leaf:
 55        """
 56        Extracts a specific element from the HTML tree based on an XPath or CSS selector
 57        and returns it as a Leaf object.
 58
 59        Parameters:
 60        -----------
 61        xpath : str, optional
 62            The XPath query to locate the element. (default is "")
 63        css_select : str, optional
 64            A CSS selector to locate the element. If provided, it is converted to XPath. (default is "")
 65
 66        Returns:
 67        --------
 68        Leaf
 69            The corresponding Leaf object for the element.
 70
 71        Raises:
 72        -------
 73        AssertionError if neither an XPath nor a CSS selector is provided, or if the element is not found.
 74        """
 75        if css_select:
 76            xpath = CSSSelector(css_select).path
 77        assert xpath, "When creating a WebLeaf please provide either a xpath or css selector."
 78        elements = self.tree.xpath(xpath)
 79        assert len(elements), f"Could not find elements at xpath [{xpath}] in html."
 80        path = self.tree.getpath(elements[0])
 81        assert path in self.path_leaves, f"The element at [{path}] was not processed by webleaf."
 82        return self.path_leaves[path]
 83
 84    def find(self, leaf: Leaf):
 85        """
 86         Finds the closest matching element in the HTML tree for the given Leaf object.
 87
 88         Parameters:
 89         -----------
 90         leaf : Leaf
 91             The Leaf object to be matched against the elements in the HTML tree.
 92
 93         Returns:
 94         --------
 95         str
 96             The XPath of the closest matching element.
 97         """
 98        return self.find_n(leaf, 1)[0]
 99
100    def find_n(self, leaf: Leaf, n):
101        """
102        Finds the top N most similar elements to the given Leaf object,
103        based on their embeddings' distance.
104
105        Parameters:
106        -----------
107        leaf : Leaf
108            The Leaf object to compare against other elements in the tree.
109        n : int
110            The number of top similar elements to return.
111
112        Returns:
113        --------
114        list of str
115            A list of XPaths corresponding to the top N most similar elements.
116        """
117        path_dists = [(path, leaf.mdist(other)) for other, path in zip(self.leaves, self.paths)]
118        sorted_path_dists = sorted(path_dists, key=lambda path_dist: path_dist[1])
119        sorted_paths = [path for path, dist in sorted_path_dists]
120        return sorted_paths[:n]

The Web class provides an interface for parsing and interacting with HTML content. It uses the WebGraphAutoEncoder to convert the HTML structure into graph-based embeddings, allowing for extraction and comparison of elements.

Attributes:

- html : str — The raw HTML content passed during initialization.
- tree : lxml.etree.ElementTree — Parsed HTML tree.
- features : list — The encoded feature vectors for each HTML element.
- paths : list — The XPath for each HTML element in the document.
- leaves : list of Leaf — Leaf objects representing the elements of the HTML tree, based on their embeddings.
- path_leaves : dict — A dictionary mapping XPaths to their corresponding Leaf objects.

Web(html: str)
32    def __init__(self, html: str):
33        """
34        Initialize the Web object by parsing the provided HTML and encoding it into embeddings.
35
36        Parameters:
37        -----------
38        html : str
39            The HTML content to be parsed and encoded.
40
41        Raises:
42        -------
43        AssertionError if the HTML content is invalid.
44        """
45        global web_graph_auto_encoder
46        self.html = html
47        if not web_graph_auto_encoder:
48            web_graph_auto_encoder = WebGraphAutoEncoder()
49        self.tree = etree.ElementTree(etree.HTML(html))
50        self.features, self.paths = web_graph_auto_encoder.extract(self.tree)
51        self.leaves = [Leaf(feat) for feat in self.features]
52        self.path_leaves = {path: leaf for path, leaf in zip(self.paths, self.leaves)}

Initialize the Web object by parsing the provided HTML and encoding it into embeddings.

Parameters:

- html : str — The HTML content to be parsed and encoded.

Raises:

AssertionError if the HTML content is invalid.

html
tree
leaves
path_leaves
def leaf(self, xpath: str = '', css_select: str = '') -> webleaf.Leaf.Leaf:
54    def leaf(self, xpath: str = "", css_select: str = "") -> Leaf:
55        """
56        Extracts a specific element from the HTML tree based on an XPath or CSS selector
57        and returns it as a Leaf object.
58
59        Parameters:
60        -----------
61        xpath : str, optional
62            The XPath query to locate the element. (default is "")
63        css_select : str, optional
64            A CSS selector to locate the element. If provided, it is converted to XPath. (default is "")
65
66        Returns:
67        --------
68        Leaf
69            The corresponding Leaf object for the element.
70
71        Raises:
72        -------
73        AssertionError if neither an XPath nor a CSS selector is provided, or if the element is not found.
74        """
75        if css_select:
76            xpath = CSSSelector(css_select).path
77        assert xpath, "When creating a WebLeaf please provide either a xpath or css selector."
78        elements = self.tree.xpath(xpath)
79        assert len(elements), f"Could not find elements at xpath [{xpath}] in html."
80        path = self.tree.getpath(elements[0])
81        assert path in self.path_leaves, f"The element at [{path}] was not processed by webleaf."
82        return self.path_leaves[path]

Extracts a specific element from the HTML tree based on an XPath or CSS selector and returns it as a Leaf object.

Parameters:

- xpath : str, optional — The XPath query to locate the element. (default is "")
- css_select : str, optional — A CSS selector to locate the element. If provided, it is converted to XPath. (default is "")

Returns:

- Leaf — The corresponding Leaf object for the element.

Raises:

AssertionError if neither an XPath nor a CSS selector is provided, or if the element is not found.

def find(self, leaf: webleaf.Leaf.Leaf):
84    def find(self, leaf: Leaf):
85        """
86         Finds the closest matching element in the HTML tree for the given Leaf object.
87
88         Parameters:
89         -----------
90         leaf : Leaf
91             The Leaf object to be matched against the elements in the HTML tree.
92
93         Returns:
94         --------
95         str
96             The XPath of the closest matching element.
97         """
98        return self.find_n(leaf, 1)[0]

Finds the closest matching element in the HTML tree for the given Leaf object.

Parameters:

- leaf : Leaf — The Leaf object to be matched against the elements in the HTML tree.

Returns:

- str — The XPath of the closest matching element.

def find_n(self, leaf: webleaf.Leaf.Leaf, n):
100    def find_n(self, leaf: Leaf, n):
101        """
102        Finds the top N most similar elements to the given Leaf object,
103        based on their embeddings' distance.
104
105        Parameters:
106        -----------
107        leaf : Leaf
108            The Leaf object to compare against other elements in the tree.
109        n : int
110            The number of top similar elements to return.
111
112        Returns:
113        --------
114        list of str
115            A list of XPaths corresponding to the top N most similar elements.
116        """
117        path_dists = [(path, leaf.mdist(other)) for other, path in zip(self.leaves, self.paths)]
118        sorted_path_dists = sorted(path_dists, key=lambda path_dist: path_dist[1])
119        sorted_paths = [path for path, dist in sorted_path_dists]
120        return sorted_paths[:n]

Finds the top N most similar elements to the given Leaf object, based on their embeddings' distance.

Parameters:

- leaf : Leaf — The Leaf object to compare against other elements in the tree.
- n : int — The number of top similar elements to return.

Returns:

- list of str — A list of XPaths corresponding to the top N most similar elements.