import xml.etree.ElementTree as ET
import os
import sys
import html
import re
import urllib.request
from urllib.parse import urlparse, quote
import socket
import unicodedata
import hashlib
import shutil

# =========================================================
# CONFIGURATION
# =========================================================

# Timeout (seconds) for network operations: installed below as the process-wide
# socket default and also passed explicitly to urlopen() in download_file().
NETWORK_TIMEOUT = 10
socket.setdefaulttimeout(NETWORK_TIMEOUT)

# Namespace prefix -> URI map handed to ElementTree lookups such as "wp:post_type".
NS = {
    "content": "http://purl.org/rss/1.0/modules/content/",
    "wp": "http://wordpress.org/export/1.2/"
}

# Local file name (inside the assets directory) -> URL it is downloaded from.
# The names must match the ../assets/... references in HTML_TEMPLATE.
ASSETS_URLS = {
    "highlight.min.css": "https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github-dark.min.css",
    "highlight.min.js": "https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js",
    "highlightjs-line-numbers.min.js": "https://cdnjs.cloudflare.com/ajax/libs/highlightjs-line-numbers.js/2.8.0/highlightjs-line-numbers.min.js"
}

# Extended list of file extensions that process_attachments() downloads and
# rewrites to local copies.
ATTACHMENT_EXTENSIONS = [
    '.zip', '.pdf', '.rar', '.7z',             # Archives and documents
    '.docx', '.xlsx', '.pptx', '.odt',        # Office documents
    '.mp3', '.ogg', '.wav',                   # Audio
    '.mp4', '.webm', '.mov'                   # Video
]

# Page skeleton for one article, filled in with str.format().
# Placeholders: {title} (used in <title> and <h1>), {date} and {content}.
# Every literal CSS/JS brace is doubled ({{ }}) so that format() leaves it alone.
# Stylesheet and scripts are referenced relatively as ../assets/<name>.
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="fr">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>{title}</title>
    <link rel="stylesheet" href="../assets/highlight.min.css">
    <style>
        /* Base Styles (Dark Mode Default) */
        body {{ font-family: -apple-system, sans-serif; line-height: 1.6; max-width: 900px; margin: auto; padding: 20px; background: #1a1a1a; color: #e0e0e0; transition: background 0.3s, color 0.3s; }}
        article {{ background: #2d2d2d; padding: 30px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.3); }}
        
        /* Light Mode */
        body.light-mode {{ background: #f4f4f4; color: #333; }}
        body.light-mode article {{ background: white; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }}
        body.light-mode h1 {{ color: #333; border-bottom-color: #eee; }}
        body.light-mode .date {{ color: #888; }}
        body.light-mode pre {{ background: #1e1e1e !important; }}
        
        /* Code Block Styling */
        pre {{ position: relative; background: #1e1e1e !important; padding: 10px; border-radius: 5px; overflow-x: auto; margin: 20px 0; }}
        code {{ font-family: 'Consolas', 'Monaco', monospace; font-size: 14px; background: transparent !important; padding: 0 !important; }}
        
        h1 {{ color: #e0e0e0; border-bottom: 2px solid #444; padding-bottom: 10px; }}
        .date {{ color: #aaa; font-size: 0.9em; }}
        img {{ max-width: 100%; height: auto; display: block; margin: 10px 0; }}

        /* Copy Button */
        .copy-button {{
            position: absolute;
            top: 5px;
            right: 5px;
            background: rgba(255, 255, 255, 0.1);
            border: 1px solid rgba(255, 255, 255, 0.2);
            border-radius: 4px;
            color: #ddd;
            font-size: 12px;
            padding: 4px 8px;
            cursor: pointer;
            transition: all 0.2s;
            z-index: 10;
        }}
        .copy-button:hover {{ background: rgba(255, 255, 255, 0.2); color: white; }}

        /* Line Numbers (Plugin) */
        .hljs-ln-numbers {{
            user-select: none;
            text-align: right;
            color: #666;
            border-right: 1px solid #444;
            vertical-align: top;
            padding-right: 15px !important;
            min-width: 30px;
        }}
        .hljs-ln-code {{
            padding-left: 15px !important;
        }}
        
        /* Dark Mode Toggle Button */
        .theme-toggle {{
            position: fixed;
            top: 20px;
            right: 20px;
            background: rgba(255, 255, 255, 0.1);
            border: 1px solid rgba(255, 255, 255, 0.2);
            border-radius: 50%;
            width: 50px;
            height: 50px;
            cursor: pointer;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 24px;
            transition: all 0.3s;
            z-index: 1000;
        }}
        .theme-toggle:hover {{ background: rgba(255, 255, 255, 0.2); }}
        
        /* Back to Top Button */
        .back-to-top {{
            position: fixed;
            bottom: 30px;
            right: 30px;
            background: rgba(0, 123, 255, 0.8);
            color: white;
            border: none;
            border-radius: 50%;
            width: 50px;
            height: 50px;
            cursor: pointer;
            display: none;
            align-items: center;
            justify-content: center;
            font-size: 24px;
            transition: all 0.3s;
            z-index: 999;
        }}
        .back-to-top:hover {{ background: rgba(0, 123, 255, 1); }}
        .back-to-top.show {{ display: flex; }}
    </style>
</head>
<body>
<button class="theme-toggle" id="themeToggle" aria-label="Toggle dark mode">🌙</button>
<button class="back-to-top" id="backToTop" aria-label="Back to top">↑</button>
<article>
    <h1>{title}</h1>
    <p class="date">Publié le {date}</p>
    <div class="content">
        {content}
    </div>
</article>
<script src="../assets/highlight.min.js"></script>
<script src="../assets/highlightjs-line-numbers.min.js"></script>
<script>
    hljs.highlightAll();
    hljs.initLineNumbersOnLoad();

    // Add Copy Button
    document.querySelectorAll('pre code').forEach((codeBlock) => {{
        // 1. Add Copy Button to PRE
        const pre = codeBlock.parentNode;
        // Check if pre is actually a pre tag (safety)
        if (pre.tagName === 'PRE') {{
            const copyBtn = document.createElement('button');
            copyBtn.className = 'copy-button';
            copyBtn.textContent = 'Copier';
            
            // Store original text for copying
            const originalText = codeBlock.textContent;

            copyBtn.addEventListener('click', () => {{
                navigator.clipboard.writeText(originalText).then(() => {{
                    copyBtn.textContent = 'Copié !';
                    setTimeout(() => {{ copyBtn.textContent = 'Copier'; }}, 2000);
                }});
            }});
            
            pre.appendChild(copyBtn);
        }}
    }});
    
    // Dark Mode Toggle
    const themeToggle = document.getElementById('themeToggle');
    const body = document.body;
    
    // Load saved theme preference
    const savedTheme = localStorage.getItem('theme');
    if (savedTheme === 'light') {{
        body.classList.add('light-mode');
        themeToggle.textContent = '☀️';
    }}
    
    themeToggle.addEventListener('click', () => {{
        body.classList.toggle('light-mode');
        const isLight = body.classList.contains('light-mode');
        themeToggle.textContent = isLight ? '☀️' : '🌙';
        localStorage.setItem('theme', isLight ? 'light' : 'dark');
    }});
    
    // Back to Top Button
    const backToTop = document.getElementById('backToTop');
    
    window.addEventListener('scroll', () => {{
        if (window.pageYOffset > 300) {{
            backToTop.classList.add('show');
        }} else {{
            backToTop.classList.remove('show');
        }}
    }});
    
    backToTop.addEventListener('click', () => {{
        window.scrollTo({{
            top: 0,
            behavior: 'smooth'
        }});
    }});
</script>
</body>
</html>
"""

# =========================================================
# UTILITAIRES DE NETTOYAGE
# =========================================================

def parse_code_blocks(content):
    """
    Turn Prettycode / Gutenberg code blocks into plain ``<pre><code>`` markup
    and strip the editor scaffolding around them.

    Steps, in order:
      1. ``<textarea class="prettycode-source">`` elements (with ``<`` / ``"``
         either raw or written as ``&lt;`` / ``&quot;``) are extracted.
      2. ``<!-- wp:prettycode/code {...} /-->`` comments are extracted from
         the ``source`` key of their JSON payload.
      3. Remaining Gutenberg delimiters (opening and closing), Prettycode
         wrapper ``<div>`` openers, ``<script>`` elements and Prettycode
         ``<header>`` elements are removed.

    Code extracted in steps 1-2 is parked behind placeholder tokens while
    step 3 runs and restored afterwards. Without that, an escaped
    ``&lt;script`` inside a code sample would open a match that runs to the
    next real ``</script>`` and delete everything in between.

    Args:
        content: HTML body of one post.

    Returns:
        The cleaned HTML string.
    """
    # Local import: json is not part of the module-level import block.
    import json

    extracted = []

    def stash(code):
        """Escape *code*, store it as a <pre><code> block, return its token."""
        # html.escape is evaluated before append, so a non-string payload
        # raises without leaving a half-registered entry behind.
        extracted.append(f'<pre><code>{html.escape(code)}</code></pre>')
        return f'\x00PRETTYCODE_BLOCK_{len(extracted) - 1}\x00'

    # 1. Code inside <textarea class="prettycode-source">, raw or entity-escaped.
    pattern_textarea = r'(?:<|&lt;)textarea[^>]*class=(?:"|&quot;)prettycode-source(?:"|&quot;)[^>]*>(.*?)(?:<|&lt;)/textarea>'

    def clean_textarea(match):
        # Unescape twice to flatten doubly-encoded entities such as &amp;lt;
        code_decoded = html.unescape(html.unescape(match.group(1).strip()))
        return stash(code_decoded)

    content = re.sub(pattern_textarea, clean_textarea, content, flags=re.DOTALL | re.IGNORECASE)

    # 2. Code carried in the JSON payload of a self-closing Gutenberg comment.
    pattern_json_code = r'<!-- wp:prettycode/code ({.*?}) /-->'

    def clean_json_code(match):
        try:
            data = json.loads(match.group(1))
            return stash(data.get('source', ''))
        except (ValueError, AttributeError, TypeError):
            # Malformed JSON, a non-object payload, or a non-string "source":
            # leave the comment untouched.
            return match.group(0)

    content = re.sub(pattern_json_code, clean_json_code, content, flags=re.DOTALL)

    # 3a. Gutenberg block delimiters: both "<!-- wp:x -->" and "<!-- /wp:x -->".
    content = re.sub(r'<!--\s*/?wp:.*?-->', '', content)

    # 3b. Leftover Prettycode wrapper <div> openers (their content is kept).
    content = re.sub(r'(?:<|&lt;)div[^>]*class=(?:"|&quot;)[^"]*prettycode[^"]*(?:"|&quot;)[^>]*>', '', content)
    content = re.sub(r'(?:<|&lt;)/div>', '</div>', content)  # normalize closers

    # 3c. CodeMirror scripts are useless in static HTML.
    content = re.sub(r'(?:<|&lt;)script.*?(?:<|&lt;)/script>', '', content, flags=re.DOTALL | re.IGNORECASE)

    # 3d. Prettycode headers (file title, language, ...).
    content = re.sub(r'(?:<|&lt;)header[^>]*prettycode-header.*?(?:<|&lt;)/header>', '', content, flags=re.DOTALL | re.IGNORECASE)

    def restore(match):
        index = int(match.group(1))
        # A token-like sequence already present in the input is left as is.
        return extracted[index] if index < len(extracted) else match.group(0)

    # A function replacement is inserted literally, so backslashes in code survive.
    return re.sub(r'\x00PRETTYCODE_BLOCK_(\d+)\x00', restore, content)

def safe_filename(url):
    """
    Derive a filesystem-friendly local file name from *url*.

    The name is the ASCII-folded basename of the URL path, with every
    character outside ``[a-zA-Z0-9_-]`` replaced by ``_``, followed by the
    first 8 hex digits of the URL's MD5 and the original extension
    (``.jpg`` when there is none). A URL whose path has no basename yields
    ``img_<hash>.jpg``.
    """
    basename = os.path.basename(urlparse(url).path)
    digest = hashlib.md5(url.encode()).hexdigest()[:8]

    if not basename:
        return f"img_{digest}.jpg"

    stem, suffix = os.path.splitext(basename)
    # Decompose accented characters, then drop whatever is not plain ASCII.
    ascii_stem = unicodedata.normalize("NFKD", stem).encode("ascii", "ignore").decode("ascii")
    clean_stem = re.sub(r"[^a-zA-Z0-9_-]", "_", ascii_stem)
    return f"{clean_stem}_{digest}{suffix or '.jpg'}"

def download_file(url, target_path):
    """
    Download *url* to *target_path* unless that file already exists.

    The payload is streamed into ``<target_path>.part`` and moved into place
    only once the transfer has completed. A failed or interrupted download
    therefore never leaves an empty or truncated file at *target_path*,
    which the existence check would otherwise treat as already downloaded.

    Failures are reported on stdout and swallowed on purpose: one broken
    link must not abort the whole conversion. Callers detect failure by
    checking whether *target_path* exists afterwards.

    Args:
        url: Address to fetch.
        target_path: Destination file path; its directory must already exist.
    """
    if os.path.exists(target_path):
        return

    partial_path = f"{target_path}.part"
    try:
        print(f"Telechargement de {os.path.basename(target_path)}...")
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=NETWORK_TIMEOUT) as response:
            with open(partial_path, "wb") as f:
                # Stream in chunks instead of holding the whole body in memory.
                shutil.copyfileobj(response, f)
        os.replace(partial_path, target_path)
    except Exception as e:
        # Best-effort boundary: network, HTTP, URL and filesystem errors all land here.
        print(f"Erreur telechargement {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing to clean up (or cleanup impossible); the .part file is
            # simply overwritten on the next attempt.
            pass

def download_image(url, images_dir):
    """
    Fetch *url* into *images_dir* under the name given by safe_filename().

    Returns:
        The local file name when the file is present on disk afterwards
        (freshly downloaded or already there), otherwise ``None``.
    """
    local_name = safe_filename(url)
    destination = os.path.join(images_dir, local_name)
    download_file(url, destination)

    if not os.path.exists(destination):
        return None
    return local_name

def process_images(html_content, images_dir):
    """
    Download the images referenced by ``<img>`` tags and point them at the
    local copies.

    Only the ``src`` attribute of ``<img>`` elements is rewritten. The
    ``src`` of other elements (``<iframe>``, ``<video>``, ``<audio>``,
    ``<source>``, ``<script>``...) is left untouched, so embeds keep
    working and media files are not saved into the images directory.

    Args:
        html_content: HTML of one post.
        images_dir: Directory that receives the downloaded images.

    Returns:
        The HTML with each successfully downloaded image referenced as
        ``../images/<file>``; sources that are already local, or whose
        download failed, are kept unchanged.
    """
    # group 1: everything from "<img" up to the opening quote of src
    # group 2: the URL, group 3: the closing quote
    img_src = re.compile(r'(<img\b[^>]*?\ssrc=")([^"]+)(")', re.IGNORECASE)

    def replace_src(match):
        url = match.group(2)
        if url.startswith(("images/", "../images/")):
            return match.group(0)
        filename = download_image(url, images_dir)
        if not filename:
            return match.group(0)
        return f'{match.group(1)}../images/{filename}{match.group(3)}'

    return img_src.sub(replace_src, html_content)

def process_attachments(html_content, files_dir):
    """
    Localize documents and media referenced by ``href`` or ``src`` attributes.

    Every double-quoted ``href``/``src`` value that ends with one of
    ATTACHMENT_EXTENSIONS (case-insensitive) is downloaded into *files_dir*
    and rewritten to ``../files/<name>``. Values that already point into
    ``files/`` or ``../files/``, and values whose download did not produce
    a file, are left as they were.
    """
    extensions_alt = '|'.join(map(re.escape, ATTACHMENT_EXTENSIONS))
    # group 1: attribute name, group 2: full URL, group 3: matched extension
    link_re = re.compile(rf'(href|src)="([^"]+({extensions_alt}))"', re.IGNORECASE)

    def localize(match):
        attribute, remote_url = match.group(1), match.group(2)

        if not remote_url.startswith(("../files/", "files/")):
            local_name = safe_filename(remote_url)
            destination = os.path.join(files_dir, local_name)
            download_file(remote_url, destination)

            if os.path.exists(destination):
                # Keep the original attribute: href for a link, src for a player.
                return f'{attribute}="../files/{local_name}"'

        return match.group(0)

    return link_re.sub(localize, html_content)

# =========================================================
# POINT D'ENTRÉE
# =========================================================

def convert_xml(xml_file):
    """
    Convert a WordPress XML export into a static HTML site under ``site_genere``.

    Output layout: ``articles/`` (one ``<slug>.html`` per post), ``images/``,
    ``assets/`` (highlight.js files) and ``files/`` (attachments).

    Only items whose ``wp:post_type`` is ``post``, whose ``wp:status`` is
    ``publish`` and that have non-empty content are written. Items missing
    either element are skipped.

    The export is parsed *before* the previous ``articles`` directory is
    removed, so an unreadable or malformed export leaves the earlier output
    intact. ``images``, ``assets`` and ``files`` are never wiped; files
    already present there are not downloaded again.

    Args:
        xml_file: Path of the WordPress export file.

    Raises:
        OSError: If *xml_file* cannot be opened.
        xml.etree.ElementTree.ParseError: If *xml_file* is not well-formed XML.
    """
    output_root = "site_genere"
    articles_dir = os.path.join(output_root, "articles")
    images_dir = os.path.join(output_root, "images")
    assets_dir = os.path.join(output_root, "assets")
    files_dir = os.path.join(output_root, "files")

    # Parse first: nothing on disk is touched if the export is unusable.
    print(f"Lecture de {xml_file}...")
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Targeted cleanup: only the articles are regenerated from scratch.
    if os.path.exists(articles_dir):
        print(f"Nettoyage du dossier des articles : {articles_dir}...")
        shutil.rmtree(articles_dir)

    for directory in (articles_dir, images_dir, assets_dir, files_dir):
        os.makedirs(directory, exist_ok=True)

    # JS/CSS assets; download_file() skips the ones that already exist.
    for filename, url in ASSETS_URLS.items():
        download_file(url, os.path.join(assets_dir, filename))

    count = 0
    for item in root.findall(".//item"):
        # findtext() returns None for a missing element, so an item without
        # wp:post_type or wp:status is skipped instead of raising.
        if item.findtext("wp:post_type", namespaces=NS) != "post":
            continue
        if item.findtext("wp:status", namespaces=NS) != "publish":
            continue

        content = item.findtext("{http://purl.org/rss/1.0/modules/content/}encoded")
        if not content:
            continue

        # A post may legitimately have no title.
        title = item.findtext("title") or "Sans titre"
        date = item.findtext("wp:post_date", namespaces=NS)

        # Keep only the last path component so a slug cannot escape
        # articles_dir; fall back to the post id when there is no usable slug.
        raw_slug = item.findtext("wp:post_name", namespaces=NS) or ""
        slug = os.path.basename(raw_slug.replace("\\", "/"))
        if not slug:
            post_id = item.findtext("wp:post_id", namespaces=NS)
            slug = f"post-{post_id or count + 1}"

        # 1. Extract code blocks first (takes priority over other cleanup)
        content = parse_code_blocks(content)
        # 2. Download images
        content = process_images(content, images_dir)
        # 3. Download media and documents (pdf, zip, mp3, mp4, ...)
        content = process_attachments(content, files_dir)

        file_path = os.path.join(articles_dir, f"{slug}.html")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(HTML_TEMPLATE.format(
                title=html.escape(title),
                content=content,
                date=date.split(" ")[0] if date else "Inconnue"
            ))
        count += 1
        print(f"Genere : {slug}.html")

    print(f"\nSucces ! {count} articles crees dans '{output_root}'.")

if __name__ == "__main__":
    # Script entry point: the first command-line argument is the export file.
    if len(sys.argv) >= 2:
        convert_xml(sys.argv[1])
    else:
        print("Usage: python wp_xml_to_html.py votre_export.xml")
