"""Pull values out of HTML by aligning it with a ``{{.name}}`` template."""

import re

try:
    from lxml import html
except ImportError:
    # lxml is only needed to parse markup.  The tree helpers below work on
    # any object exposing ``tag``/``text`` and child iteration, so the module
    # stays importable without it; parsing raises a clear error instead.
    html = None

# A placeholder looks like ``{{.name}}`` where ``name`` is one or more word
# characters.  Compiled once because it is applied to every node visited.
_PLACEHOLDER_RE = re.compile(r'\{\{\.(\w+)\}\}')


def _require_lxml():
    """Return the ``lxml.html`` module, or raise ImportError if unavailable."""
    if html is None:
        raise ImportError("lxml is required to parse HTML")
    return html


def _placeholder_name(text):
    """Return the first placeholder name found in *text*, or ``None``.

    ``None`` is returned for empty/missing text and for text that merely
    contains ``{{.`` without forming a complete ``{{.name}}`` placeholder.
    Only the first placeholder in a text is considered.
    """
    if not text:
        return None
    match = _PLACEHOLDER_RE.search(text)
    return match.group(1) if match else None


class CustomTreeNode:
    """Minimal tree node holding a tag, its leading text and child nodes.

    Attributes:
        tag: The element tag this node was built from.
        text: The text directly inside the element (before any child), or
            ``None``.
        children: List of child ``CustomTreeNode`` objects, in order.
    """

    def __init__(self, tag, text=None, children=None):
        self.tag = tag
        self.text = text
        # A falsy ``children`` (None or empty) always yields a fresh list so
        # that nodes never share one mutable default.
        self.children = children if children else []

    def __iter__(self):
        """Iterate over the direct children, mirroring lxml elements."""
        return iter(self.children)

    def __repr__(self):
        return (
            f"{type(self).__name__}(tag={self.tag!r}, text={self.text!r}, "
            f"children={self.children!r})"
        )


def build_template_tree(template_html):
    """Parse *template_html* with lxml and return it as a CustomTreeNode tree.

    Raises:
        ImportError: If lxml is not installed.
    """
    parsed = _require_lxml().fromstring(template_html)
    return build_custom_tree(parsed)


def build_custom_tree(template_tree):
    """Recursively copy an element tree into ``CustomTreeNode`` objects.

    *template_tree* may be any object with ``tag`` and ``text`` attributes
    whose iteration yields its children (an lxml element, for instance).
    Only the tag and leading text are copied; nothing else is retained.
    """
    return CustomTreeNode(
        template_tree.tag,
        text=template_tree.text,
        children=[build_custom_tree(child) for child in template_tree],
    )


def extract_placeholder_names(template_tree):
    """Return the set of placeholder names used anywhere in *template_tree*.

    Each node contributes at most one name: the first ``{{.name}}`` in its
    text.  Text that contains ``{{.`` but no complete placeholder is ignored
    rather than raising.
    """
    placeholders = set()
    # Explicit stack instead of recursion so deep templates cannot hit the
    # interpreter's recursion limit; visit order is irrelevant for a set.
    pending = [template_tree]
    while pending:
        node = pending.pop()
        name = _placeholder_name(node.text)
        if name is not None:
            placeholders.add(name)
        pending.extend(node.children)
    return placeholders


def find_matching_elements(html_node, template_node, matching_elements):
    """Record the text of HTML nodes that line up with template placeholders.

    The two trees are walked in lockstep, pairing children purely by
    position; tags and attributes are not compared.  Whenever the template
    node's text holds a placeholder, ``matching_elements[name]`` is set to
    the text of the HTML node at the same position.  If a name occurs more
    than once, the last node visited (pre-order) wins.

    Args:
        html_node: Node with a ``text`` attribute whose iteration yields its
            children (e.g. an lxml element).
        template_node: ``CustomTreeNode`` to align against.
        matching_elements: Dict updated in place with ``name -> text``.

    Template children with no HTML counterpart are skipped, so their
    placeholders are simply absent from *matching_elements*.
    """
    name = _placeholder_name(template_node.text)
    if name is not None:
        matching_elements[name] = html_node.text

    # zip() stops at the shorter child sequence, so surplus children on
    # either side are ignored.
    for html_child, template_child in zip(html_node, template_node.children):
        find_matching_elements(html_child, template_child, matching_elements)


def main():
    """Run the demo: match a one-placeholder template against sample HTML."""
    template_html = ' <span id="p1">{{.placeholder1}}</span> '
    actual_html = (
        " <div>This is the content.</div>"
        ' <div id="p1">first p text</div>'
        " <div>Some other content.</div>"
        ' <span id="p2">second p text</span>'
        " </html> "
    )

    template_tree = build_template_tree(template_html)
    placeholder_names = extract_placeholder_names(template_tree)

    # NOTE(review): lxml presumably wraps this multi-element fragment in a
    # container element, so the template root is aligned with that wrapper
    # rather than with any of the elements carrying an id -- confirm whether
    # positional matching is what is wanted here.
    actual_tree = _require_lxml().fromstring(actual_html)

    matching_elements = {}
    find_matching_elements(actual_tree, template_tree, matching_elements)

    # .get(): a placeholder with no positional counterpart in the document
    # is reported as None instead of raising KeyError.
    result = {
        placeholder: matching_elements.get(placeholder)
        for placeholder in placeholder_names
    }
    print(result)


if __name__ == "__main__":
    main()