webleaf.Web
from .Leaf import Leaf
from .model.WebGraphAutoEncoder import WebGraphAutoEncoder
from lxml import etree
from lxml.cssselect import CSSSelector

# Global variable to hold the WebGraphAutoEncoder instance.
# It is created by the first Web() that needs it and reused by later ones.
web_graph_auto_encoder = None


class Web:
    """
    The Web class provides an interface for parsing and interacting with HTML content.
    It uses the WebGraphAutoEncoder to convert the HTML structure into graph-based embeddings,
    allowing for extraction and comparison of elements.

    Attributes:
    -----------
    html : str
        The raw HTML content passed during initialization.
    tree : lxml.etree.ElementTree
        Parsed HTML tree.
    features : list
        The encoded feature vectors for each HTML element.
    paths : list
        The XPath for each HTML element in the document.
    leaves : list of Leaf
        Leaf objects representing the elements of the HTML tree, based on their embeddings.
    path_leaves : dict
        A dictionary mapping XPaths to their corresponding Leaf objects.
    """
    def __init__(self, html: str):
        """
        Initialize the Web object by parsing the provided HTML and encoding it into embeddings.

        Parameters:
        -----------
        html : str
            The HTML content to be parsed and encoded.

        Raises:
        -------
        Nothing explicitly: the constructor does no validation of its own, so any
        exception raised while parsing the HTML or extracting features propagates.
        """
        global web_graph_auto_encoder
        self.html = html
        # Build the shared encoder once, on first use. Compare against None explicitly so
        # an existing encoder is never rebuilt just because it evaluates as falsy.
        if web_graph_auto_encoder is None:
            web_graph_auto_encoder = WebGraphAutoEncoder()
        self.tree = etree.ElementTree(etree.HTML(html))
        self.features, self.paths = web_graph_auto_encoder.extract(self.tree)
        self.leaves = [Leaf(feat) for feat in self.features]
        # paths and leaves are paired by position
        self.path_leaves = dict(zip(self.paths, self.leaves))

    def leaf(self, xpath: str = "", css_select: str = "") -> Leaf:
        """
        Extracts a specific element from the HTML tree based on an XPath or CSS selector
        and returns it as a Leaf object.

        Parameters:
        -----------
        xpath : str, optional
            The XPath query to locate the element. (default is "")
        css_select : str, optional
            A CSS selector to locate the element. If provided, it is converted to XPath
            and replaces any value passed as xpath. (default is "")

        Returns:
        --------
        Leaf
            The Leaf object for the first element matched by the query.

        Raises:
        -------
        AssertionError if neither an XPath nor a CSS selector is provided, if the element
        is not found, or if the matched element has no entry in path_leaves.
        """
        if css_select:
            xpath = CSSSelector(css_select).path
        # These checks validate caller input, so they are raised explicitly rather than
        # written as `assert` statements, which are removed when Python runs with -O.
        # AssertionError is kept so existing callers catching it continue to work.
        if not xpath:
            raise AssertionError("When creating a WebLeaf please provide either a xpath or css selector.")
        elements = self.tree.xpath(xpath)
        if not elements:
            raise AssertionError(f"Could not find elements at xpath [{xpath}] in html.")
        # Only the first match is used; its path is the key into path_leaves
        path = self.tree.getpath(elements[0])
        if path not in self.path_leaves:
            raise AssertionError(f"The element at [{path}] was not processed by webleaf.")
        return self.path_leaves[path]

    def find(self, leaf: Leaf) -> str:
        """
        Finds the closest matching element in the HTML tree for the given Leaf object.

        Parameters:
        -----------
        leaf : Leaf
            The Leaf object to be matched against the elements in the HTML tree.

        Returns:
        --------
        str
            The XPath of the closest matching element.

        Raises:
        -------
        IndexError if there are no elements to compare against.
        """
        matches = self.find_n(leaf, 1)
        if not matches:
            # Same exception type as indexing an empty list, but with a usable message
            raise IndexError("Cannot find a match: this Web object has no elements to compare against.")
        return matches[0]

    def find_n(self, leaf: Leaf, n: int):
        """
        Finds the top N most similar elements to the given Leaf object,
        based on their embeddings' distance.

        Parameters:
        -----------
        leaf : Leaf
            The Leaf object to compare against other elements in the tree.
        n : int
            The number of top similar elements to return. Zero or a negative value
            yields an empty list; None returns every element, ranked.

        Returns:
        --------
        list of str
            A list of XPaths corresponding to the top N most similar elements,
            ordered from smallest to largest distance.
        """
        # A raw slice [:n] with negative n would return "all but the last |n|" matches,
        # which is not "top N"; clamp it to zero instead. None keeps plain slice semantics.
        limit = None if n is None else max(n, 0)
        # sorted() is stable, so elements at equal distance keep their order in self.paths
        ranked = sorted(zip(self.leaves, self.paths), key=lambda pair: leaf.mdist(pair[0]))
        return [path for _, path in ranked[:limit]]
class Web:
    """
    The Web class provides an interface for parsing and interacting with HTML content.
    It uses the WebGraphAutoEncoder to convert the HTML structure into graph-based embeddings,
    allowing for extraction and comparison of elements.

    Attributes:
    -----------
    html : str
        The raw HTML content passed during initialization.
    tree : lxml.etree.ElementTree
        Parsed HTML tree.
    features : list
        The encoded feature vectors for each HTML element.
    paths : list
        The XPath for each HTML element in the document.
    leaves : list of Leaf
        Leaf objects representing the elements of the HTML tree, based on their embeddings.
    path_leaves : dict
        A dictionary mapping XPaths to their corresponding Leaf objects.
    """
    def __init__(self, html: str):
        """
        Initialize the Web object by parsing the provided HTML and encoding it into embeddings.

        Parameters:
        -----------
        html : str
            The HTML content to be parsed and encoded.

        Raises:
        -------
        Nothing explicitly: the constructor does no validation of its own, so any
        exception raised while parsing the HTML or extracting features propagates.
        """
        global web_graph_auto_encoder
        self.html = html
        # Build the shared encoder once, on first use. Compare against None explicitly so
        # an existing encoder is never rebuilt just because it evaluates as falsy.
        if web_graph_auto_encoder is None:
            web_graph_auto_encoder = WebGraphAutoEncoder()
        self.tree = etree.ElementTree(etree.HTML(html))
        self.features, self.paths = web_graph_auto_encoder.extract(self.tree)
        self.leaves = [Leaf(feat) for feat in self.features]
        # paths and leaves are paired by position
        self.path_leaves = dict(zip(self.paths, self.leaves))

    def leaf(self, xpath: str = "", css_select: str = "") -> Leaf:
        """
        Extracts a specific element from the HTML tree based on an XPath or CSS selector
        and returns it as a Leaf object.

        Parameters:
        -----------
        xpath : str, optional
            The XPath query to locate the element. (default is "")
        css_select : str, optional
            A CSS selector to locate the element. If provided, it is converted to XPath
            and replaces any value passed as xpath. (default is "")

        Returns:
        --------
        Leaf
            The Leaf object for the first element matched by the query.

        Raises:
        -------
        AssertionError if neither an XPath nor a CSS selector is provided, if the element
        is not found, or if the matched element has no entry in path_leaves.
        """
        if css_select:
            xpath = CSSSelector(css_select).path
        # These checks validate caller input, so they are raised explicitly rather than
        # written as `assert` statements, which are removed when Python runs with -O.
        # AssertionError is kept so existing callers catching it continue to work.
        if not xpath:
            raise AssertionError("When creating a WebLeaf please provide either a xpath or css selector.")
        elements = self.tree.xpath(xpath)
        if not elements:
            raise AssertionError(f"Could not find elements at xpath [{xpath}] in html.")
        # Only the first match is used; its path is the key into path_leaves
        path = self.tree.getpath(elements[0])
        if path not in self.path_leaves:
            raise AssertionError(f"The element at [{path}] was not processed by webleaf.")
        return self.path_leaves[path]

    def find(self, leaf: Leaf) -> str:
        """
        Finds the closest matching element in the HTML tree for the given Leaf object.

        Parameters:
        -----------
        leaf : Leaf
            The Leaf object to be matched against the elements in the HTML tree.

        Returns:
        --------
        str
            The XPath of the closest matching element.

        Raises:
        -------
        IndexError if there are no elements to compare against.
        """
        matches = self.find_n(leaf, 1)
        if not matches:
            # Same exception type as indexing an empty list, but with a usable message
            raise IndexError("Cannot find a match: this Web object has no elements to compare against.")
        return matches[0]

    def find_n(self, leaf: Leaf, n: int):
        """
        Finds the top N most similar elements to the given Leaf object,
        based on their embeddings' distance.

        Parameters:
        -----------
        leaf : Leaf
            The Leaf object to compare against other elements in the tree.
        n : int
            The number of top similar elements to return. Zero or a negative value
            yields an empty list; None returns every element, ranked.

        Returns:
        --------
        list of str
            A list of XPaths corresponding to the top N most similar elements,
            ordered from smallest to largest distance.
        """
        # A raw slice [:n] with negative n would return "all but the last |n|" matches,
        # which is not "top N"; clamp it to zero instead. None keeps plain slice semantics.
        limit = None if n is None else max(n, 0)
        # sorted() is stable, so elements at equal distance keep their order in self.paths
        ranked = sorted(zip(self.leaves, self.paths), key=lambda pair: leaf.mdist(pair[0]))
        return [path for _, path in ranked[:limit]]
The Web class provides an interface for parsing and interacting with HTML content. It uses the WebGraphAutoEncoder to convert the HTML structure into graph-based embeddings, allowing for extraction and comparison of elements.
Attributes:
html : str The raw HTML content passed during initialization. tree : lxml.etree.ElementTree Parsed HTML tree. features : list The encoded feature vectors for each HTML element. paths : list The XPath for each HTML element in the document. leaves : list of Leaf Leaf objects representing the elements of the HTML tree, based on their embeddings. path_leaves : dict A dictionary mapping XPaths to their corresponding Leaf objects.
def __init__(self, html: str):
    """
    Initialize the Web object by parsing the provided HTML and encoding it into embeddings.

    Parameters:
    -----------
    html : str
        The HTML content to be parsed and encoded.

    Raises:
    -------
    Nothing explicitly: the constructor does no validation of its own, so any
    exception raised while parsing the HTML or extracting features propagates.
    """
    global web_graph_auto_encoder
    self.html = html
    # Build the shared encoder once, on first use. Compare against None explicitly so
    # an existing encoder is never rebuilt just because it evaluates as falsy.
    if web_graph_auto_encoder is None:
        web_graph_auto_encoder = WebGraphAutoEncoder()
    self.tree = etree.ElementTree(etree.HTML(html))
    self.features, self.paths = web_graph_auto_encoder.extract(self.tree)
    self.leaves = [Leaf(feat) for feat in self.features]
    # paths and leaves are paired by position
    self.path_leaves = dict(zip(self.paths, self.leaves))
Initialize the Web object by parsing the provided HTML and encoding it into embeddings.
Parameters:
html : str The HTML content to be parsed and encoded.
Raises:
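Nothing explicitly: the constructor does no validation of its own, so any exception raised while parsing the HTML or extracting features propagates to the caller.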
def leaf(self, xpath: str = "", css_select: str = "") -> Leaf:
    """
    Extracts a specific element from the HTML tree based on an XPath or CSS selector
    and returns it as a Leaf object.

    Parameters:
    -----------
    xpath : str, optional
        The XPath query to locate the element. (default is "")
    css_select : str, optional
        A CSS selector to locate the element. If provided, it is converted to XPath
        and replaces any value passed as xpath. (default is "")

    Returns:
    --------
    Leaf
        The Leaf object for the first element matched by the query.

    Raises:
    -------
    AssertionError if neither an XPath nor a CSS selector is provided, if the element
    is not found, or if the matched element has no entry in path_leaves.
    """
    if css_select:
        xpath = CSSSelector(css_select).path
    # These checks validate caller input, so they are raised explicitly rather than
    # written as `assert` statements, which are removed when Python runs with -O.
    # AssertionError is kept so existing callers catching it continue to work.
    if not xpath:
        raise AssertionError("When creating a WebLeaf please provide either a xpath or css selector.")
    elements = self.tree.xpath(xpath)
    if not elements:
        raise AssertionError(f"Could not find elements at xpath [{xpath}] in html.")
    # Only the first match is used; its path is the key into path_leaves
    path = self.tree.getpath(elements[0])
    if path not in self.path_leaves:
        raise AssertionError(f"The element at [{path}] was not processed by webleaf.")
    return self.path_leaves[path]
Extracts a specific element from the HTML tree based on an XPath or CSS selector and returns it as a Leaf object.
Parameters:
xpath : str, optional The XPath query to locate the element. (default is "") css_select : str, optional A CSS selector to locate the element. If provided, it is converted to XPath. (default is "")
Returns:
Leaf The corresponding Leaf object for the element.
Raises:
AssertionError if neither an XPath nor a CSS selector is provided, or if the element is not found.
def find(self, leaf: Leaf) -> str:
    """
    Finds the closest matching element in the HTML tree for the given Leaf object.

    Parameters:
    -----------
    leaf : Leaf
        The Leaf object to be matched against the elements in the HTML tree.

    Returns:
    --------
    str
        The XPath of the closest matching element.

    Raises:
    -------
    IndexError if there are no elements to compare against.
    """
    matches = self.find_n(leaf, 1)
    if not matches:
        # Same exception type as indexing an empty list, but with a usable message
        raise IndexError("Cannot find a match: this Web object has no elements to compare against.")
    return matches[0]
Finds the closest matching element in the HTML tree for the given Leaf object.
Parameters:
leaf : Leaf The Leaf object to be matched against the elements in the HTML tree.
Returns:
str The XPath of the closest matching element.
def find_n(self, leaf: Leaf, n: int):
    """
    Finds the top N most similar elements to the given Leaf object,
    based on their embeddings' distance.

    Parameters:
    -----------
    leaf : Leaf
        The Leaf object to compare against other elements in the tree.
    n : int
        The number of top similar elements to return. Zero or a negative value
        yields an empty list; None returns every element, ranked.

    Returns:
    --------
    list of str
        A list of XPaths corresponding to the top N most similar elements,
        ordered from smallest to largest distance.
    """
    # A raw slice [:n] with negative n would return "all but the last |n|" matches,
    # which is not "top N"; clamp it to zero instead. None keeps plain slice semantics.
    limit = None if n is None else max(n, 0)
    # sorted() is stable, so elements at equal distance keep their order in self.paths
    ranked = sorted(zip(self.leaves, self.paths), key=lambda pair: leaf.mdist(pair[0]))
    return [path for _, path in ranked[:limit]]
Finds the top N most similar elements to the given Leaf object, based on their embeddings' distance.
Parameters:
leaf : Leaf The Leaf object to compare against other elements in the tree. n : int The number of top similar elements to return.
Returns:
list of str A list of XPaths corresponding to the top N most similar elements.