from lxml import html
import re

class CustomTreeNode:
    """Minimal tree node carrying a tag, optional text and a list of children."""

    def __init__(self, tag, text=None, children=None):
        # A falsy ``children`` (None or an empty sequence) is swapped for a
        # fresh list, so every node owns its own container by default.
        self.children = children or []
        self.text = text
        self.tag = tag

def build_template_tree(template_html):
    """Parse ``template_html`` with ``html.fromstring`` and convert the result
    via ``build_custom_tree``."""
    return build_custom_tree(html.fromstring(template_html))

def build_custom_tree(template_tree):
    """Recursively mirror an element tree as ``CustomTreeNode`` objects.

    Each node copies the element's ``tag`` and ``text``; iterating the element
    yields its child elements, which are converted the same way.
    """
    converted_children = [build_custom_tree(element) for element in template_tree]
    return CustomTreeNode(
        template_tree.tag,
        text=template_tree.text,
        children=converted_children,
    )

def extract_placeholder_names(template_tree):
    """Collect every ``{{.name}}`` placeholder name found in a template tree.

    Args:
        template_tree: Root node. Every node must expose ``text`` (a string
            or ``None``) and ``children`` (an iterable of nodes).

    Returns:
        set: The distinct placeholder names appearing in any node's text.
    """
    placeholders = set()
    # Explicit stack instead of recursion, so very deep templates cannot hit
    # the interpreter's recursion limit.
    pending = [template_tree]
    while pending:
        node = pending.pop()
        if node.text:
            # findall yields nothing for text that merely contains "{{."
            # without a well-formed placeholder (no crash on a failed match),
            # and yields all names when one text holds several placeholders.
            placeholders.update(re.findall(r'\{\{\.(\w+)\}\}', node.text))
        pending.extend(node.children)
    return placeholders

def find_matching_elements(html_node, template_node, matching_elements):
    """Walk an HTML tree and a template tree in lockstep, recording texts.

    For each template node whose text holds ``{{.name}}`` placeholders, the
    text of the HTML node at the same position is stored under each name.

    Args:
        html_node: Element exposing ``text`` and iterating over its children.
        template_node: Template node exposing ``text`` and ``children``.
        matching_elements: Dict updated in place, mapping placeholder name to
            the corresponding HTML node's text (which may be ``None``).

    Returns:
        None. Results are written into ``matching_elements``.
    """
    if template_node.text:
        # findall gives an empty list when "{{." is present but malformed, so
        # a failed match is skipped instead of raising AttributeError.
        for placeholder_name in re.findall(r'\{\{\.(\w+)\}\}', template_node.text):
            matching_elements[placeholder_name] = html_node.text
    # zip stops at the shorter side: children without a positional
    # counterpart in the other tree are not visited.
    for html_child, template_child in zip(html_node, template_node.children):
        find_matching_elements(html_child, template_child, matching_elements)

if __name__ == "__main__":
    # Demo: build a template tree, collect its placeholder names, then walk
    # the parsed page alongside the template to pick up the matching texts.
    template_html = """
            <span id="p1">{{.placeholder1}}</span>
    """

    # NOTE(review): the closing </html> below has no opening tag; presumably
    # this relies on the HTML parser being lenient -- confirm it is intended.
    actual_html = """
           <div>This is the content.</div>
            <div id="p1">first p text</div>
            <div>Some other content.</div>
            <span id="p2">second p text</span>
        </html>
    """

    template_tree = build_template_tree(template_html)
    placeholder_names = extract_placeholder_names(template_tree)

    actual_tree = html.fromstring(actual_html)
    matching_elements = {}
    find_matching_elements(actual_tree, template_tree, matching_elements)

    # Use .get so a placeholder the lockstep walk never reached is reported
    # as None instead of aborting the script with KeyError.
    result = {
        placeholder: matching_elements.get(placeholder)
        for placeholder in placeholder_names
    }
    print(result)

# Edit paste  (leftover paste-site UI text; not part of the program)