#!/usr/bin/env python3
"""Remove orphaned <w:r> elements from DOCX"""
import zipfile
from pathlib import Path
import tempfile
import re

docx_path = '/sessions/dreamy-great-hypatia/mnt/mdp/Fabric prime/02_Appendix_A_Security_Privacy_Model_Fabric_Primary_v1.docx'

# Archive member that holds the main document body.
DOCUMENT_PART = 'word/document.xml'

# One or more attribute-less <w:r> runs that are empty or whitespace-only and
# sit directly after a closing </w:p>. '\r' is part of the whitespace class
# because the XML is decoded from raw bytes (no newline translation), so runs
# containing CRLF line endings still match.
_ORPHANED_RUNS = re.compile(r'</w:p>(?:<w:r>[ \t\r\n]*</w:r>)+')


def remove_orphaned_runs(xml: str) -> str:
    """Return *xml* with empty <w:r> runs that directly follow </w:p> removed.

    Only runs without attributes and with no content other than spaces, tabs
    and line breaks are dropped; the preceding </w:p> is kept. Runs that
    carry content, or that do not immediately follow a </w:p>, are left
    untouched.
    """
    return _ORPHANED_RUNS.sub('</w:p>', xml)


def clean_docx(path) -> bool:
    """Strip orphaned runs from the document part of the DOCX at *path*.

    The replacement archive is assembled in a temporary directory and the
    original file is overwritten only after it has been built completely, so
    a failure while repacking cannot leave a truncated document behind.
    Archive members are copied in their original order; every member other
    than ``word/document.xml`` is copied byte-for-byte.

    Args:
        path: Location of the .docx file (``str`` or ``Path``).

    Returns:
        True if the file was rewritten, False if there was nothing to remove
        (in which case the file is not touched).

    Raises:
        FileNotFoundError: If *path* does not exist or the archive has no
            ``word/document.xml`` member.
        zipfile.BadZipFile: If *path* is not a valid zip archive.
    """
    source = Path(path)

    with zipfile.ZipFile(source, 'r') as zin:
        try:
            raw_xml = zin.read(DOCUMENT_PART)
        except KeyError:
            raise FileNotFoundError(
                f"{DOCUMENT_PART} not found in {source}"
            ) from None

        # Work on the exact bytes stored in the archive: no newline
        # translation on the way in or out.
        cleaned_xml = remove_orphaned_runs(raw_xml.decode('utf-8')).encode('utf-8')
        if cleaned_xml == raw_xml:
            return False

        with tempfile.TemporaryDirectory() as tmpdir:
            rebuilt = Path(tmpdir) / source.name
            with zipfile.ZipFile(rebuilt, 'w', zipfile.ZIP_DEFLATED) as zout:
                # infolist() yields members in archive order; passing the
                # ZipInfo to writestr keeps each member's name, timestamp
                # and compression method.
                for info in zin.infolist():
                    if info.filename == DOCUMENT_PART:
                        data = cleaned_xml
                    else:
                        data = zin.read(info)
                    zout.writestr(info, data)
            payload = rebuilt.read_bytes()

    # Overwrite the original only now that the new archive is complete and
    # the source archive has been closed.
    source.write_bytes(payload)
    return True


def main() -> None:
    """Clean the hard-coded document and report progress on stdout."""
    print("Removing orphaned <w:r> runs...")
    clean_docx(docx_path)
    print("Cleaned document")
    print(f"Fixed {docx_path}")


if __name__ == "__main__":
    main()