#!/usr/bin/env python3
"""Surgically fix DOCX XML structure.

Reads ``word/document.xml`` out of a .docx archive, strips the run elements
found inside spans flagged as malformed, and writes the archive back.

The archive is rewritten through a temporary file in the same directory and
swapped into place with ``os.replace``, so a failure part-way through never
leaves a truncated document behind.  Archive members keep their original
order, and the file is left untouched when nothing needed fixing.
"""

import os
import re
import sys
import tempfile
import zipfile
from pathlib import Path

docx_path = '/sessions/dreamy-great-hypatia/mnt/mdp/Fabric prime/02_Appendix_A_Security_Privacy_Model_Fabric_Primary_v1.docx'

# Archive member that holds the main document body.
DOCUMENT_MEMBER = 'word/document.xml'

# NOTE(review): both patterns below are reproduced exactly as found, but they
# appear to have lost their XML tag literals (the accompanying comments read
# "look for orphaned tags before" / "Pattern: followed by followed by", with
# the tag names missing as well).  As written, each pattern matches the empty
# string, so remove_orphaned_runs() refuses to run with them.  Restore the
# intended tag names before using this script.
MALFORMED_SPAN_PATTERN = r'(.*?)*\s*'
ORPHANED_RUN_PATTERN = r'(.*?)'


def _reject_empty_match(label, compiled):
    """Raise ValueError if *compiled* can match the empty string."""
    if compiled.fullmatch('') is not None:
        raise ValueError(
            f"{label} pattern {compiled.pattern!r} matches the empty string; "
            "refusing to edit the document with it"
        )


def remove_orphaned_runs(content,
                         span_pattern=MALFORMED_SPAN_PATTERN,
                         run_pattern=ORPHANED_RUN_PATTERN):
    """Strip *run_pattern* matches from every *span_pattern* match.

    Args:
        content: Text of the document XML.
        span_pattern: Regex (compiled with ``re.DOTALL``) locating each
            malformed span.
        run_pattern: Regex (compiled without flags) for the elements to
            delete inside a span.

    Returns:
        ``(new_content, span_count)`` where ``span_count`` is the number of
        spans that matched.

    Raises:
        ValueError: If either pattern can match the empty string.  Such a
            pattern would match at every position of the document instead of
            at a specific structure.
    """
    span_re = re.compile(span_pattern, re.DOTALL)
    run_re = re.compile(run_pattern)
    _reject_empty_match("span", span_re)
    _reject_empty_match("run", run_re)

    def _clean(match):
        matched_text = match.group()
        print(f"Found potential issue at position {match.start()}-{match.end()}")
        print(f"Matched text sample: {matched_text[:200]}...")
        print("Removed orphaned run elements")
        return run_re.sub('', matched_text)

    # One positional pass: only the matched spans are rewritten, rather than
    # every textual occurrence of the matched text.
    return span_re.subn(_clean, content)


def _replace_member(path, member, data):
    """Rewrite the archive at *path* with *member* replaced by *data*.

    The other members are copied unchanged and in their original order;
    directory entries are skipped.  The new archive is built next to *path*
    and then moved over it.
    """
    fd, tmp_name = tempfile.mkstemp(suffix=path.suffix, dir=path.parent)
    os.close(fd)
    tmp_path = Path(tmp_name)
    try:
        with zipfile.ZipFile(path, 'r') as src:
            with zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as dst:
                for info in src.infolist():
                    if info.is_dir():
                        continue
                    if info.filename == member:
                        payload = data
                    else:
                        payload = src.read(info.filename)
                    dst.writestr(info.filename, payload)
        # mkstemp creates the file owner-only; carry over the original mode.
        os.chmod(tmp_path, path.stat().st_mode & 0o7777)
        os.replace(tmp_path, path)
    finally:
        # Still present only if something failed before the replace.
        if tmp_path.exists():
            tmp_path.unlink()


def fix_docx(path,
             span_pattern=MALFORMED_SPAN_PATTERN,
             run_pattern=ORPHANED_RUN_PATTERN):
    """Fix the document XML of the .docx at *path* in place.

    Args:
        path: Location of the .docx file (``str`` or ``Path``).
        span_pattern: Passed through to :func:`remove_orphaned_runs`.
        run_pattern: Passed through to :func:`remove_orphaned_runs`.

    Returns:
        ``True`` if the archive was rewritten, ``False`` if the document XML
        was already clean and the file was left untouched.

    Raises:
        ValueError: If a pattern is rejected by :func:`remove_orphaned_runs`.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    path = Path(path)
    with zipfile.ZipFile(path, 'r') as archive:
        content = archive.read(DOCUMENT_MEMBER).decode('utf-8')

    print("Searching for malformed structure...")
    cleaned, _ = remove_orphaned_runs(content, span_pattern, run_pattern)

    if cleaned == content:
        print(f"No changes needed for {path}")
        return False

    _replace_member(path, DOCUMENT_MEMBER, cleaned.encode('utf-8'))
    print(f"Fixed {path}")
    return True


def main():
    """Run the fix against the hard-coded ``docx_path``."""
    try:
        fix_docx(docx_path)
    except ValueError as exc:
        sys.exit(f"error: {exc}")


if __name__ == "__main__":
    main()