# mdp/fix_docx_surgical.py

#!/usr/bin/env python3
"""Surgically fix DOCX XML structure"""
import zipfile
from pathlib import Path
import tempfile
import re

# Default document to repair; a different file can be given as the first
# command-line argument.
docx_path = '/sessions/dreamy-great-hypatia/mnt/mdp/Fabric prime/02_Appendix_A_Security_Privacy_Model_Fabric_Primary_v1.docx'

# Archive member that holds the main document body.
DOCUMENT_XML = 'word/document.xml'

# One or more <w:r> elements sitting directly between a closing </w:p> and
# the closing </w:body>.
#   * the lookbehind/lookahead keep </w:p>, the trailing whitespace and
#     </w:body> themselves out of the match, so only the runs are removed;
#   * the opening tag may carry attributes (e.g. w:rsidR="...");
#   * the tempered dot refuses to cross another <w:r> or <w:p> start tag, so
#     a match can never swallow runs that belong to a real paragraph;
#   * DOTALL lets a single run span several lines.
_ORPHANED_RUNS_RE = re.compile(
    r'(?<=</w:p>)'
    r'(?:\s*<w:r(?:\s[^>]*)?>(?:(?!<w:r[\s>]|<w:p[\s>/]).)*?</w:r>)+'
    r'(?=\s*</w:body>)',
    re.DOTALL,
)


def strip_orphaned_runs(content: str) -> str:
    """Return *content* without run elements orphaned just before ``</w:body>``.

    A run is treated as orphaned when it (together with any sibling runs)
    sits between a closing ``</w:p>`` and the closing ``</w:body>`` tag,
    separated only by whitespace. Everything else is returned untouched, so
    the result equals the input when there is nothing to fix.
    """
    return _ORPHANED_RUNS_RE.sub('', content)


def fix_docx(path) -> bool:
    """Remove orphaned runs from the DOCX at *path*, rewriting it in place.

    Args:
        path: Location of the ``.docx`` file (string or path-like).

    Returns:
        True if the document was modified and rewritten, False if no
        orphaned runs were found (the file is then left untouched).

    Raises:
        KeyError: The archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: *path* is not a valid zip archive.
        OSError: The file could not be read or written.
    """
    import io

    source = Path(path)

    # Load every member up front so the archive is closed before the file is
    # overwritten, and so the original member order is preserved.
    with zipfile.ZipFile(source, 'r') as archive:
        members = [(info, archive.read(info)) for info in archive.infolist()]

    if not any(info.filename == DOCUMENT_XML for info, _ in members):
        raise KeyError(f"There is no item named {DOCUMENT_XML!r} in {source}")

    changed = False
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as rebuilt:
        for info, data in members:
            if info.filename == DOCUMENT_XML:
                # Decode/encode explicitly: no newline translation is applied,
                # so bytes outside the removed runs stay exactly as they were.
                original = data.decode('utf-8')
                cleaned = strip_orphaned_runs(original)
                if cleaned != original:
                    changed = True
                    data = cleaned.encode('utf-8')
            # A fresh ZipInfo avoids carrying over stale offsets/CRCs from the
            # source archive while keeping name, timestamp and attributes.
            entry = zipfile.ZipInfo(info.filename, date_time=info.date_time)
            entry.compress_type = zipfile.ZIP_DEFLATED
            entry.external_attr = info.external_attr
            rebuilt.writestr(entry, data)

    if not changed:
        return False

    # The replacement archive is complete in memory before the original is
    # touched, so a failure while building it cannot corrupt the document.
    source.write_bytes(buffer.getvalue())
    return True


def main(argv=None) -> None:
    """Repair the document named in *argv* (or ``sys.argv``), else ``docx_path``."""
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    target = args[0] if args else docx_path

    print("Searching for malformed structure...")
    if fix_docx(target):
        print("Removed orphaned run elements")
        print(f"Fixed {target}")
    else:
        print(f"No orphaned run elements found; left {target} unchanged")


if __name__ == '__main__':
    main()