#!/usr/bin/env python3
"""Remove orphaned elements from DOCX.

Reads ``word/document.xml`` out of a .docx archive, deletes every match of a
regular expression from it, and rewrites the archive with the cleaned part.
"""
import os
import re
import shutil
import tempfile
import zipfile
from pathlib import Path

DOCX_PATH = '/sessions/dreamy-great-hypatia/mnt/mdp/Fabric prime/02_Appendix_A_Security_Privacy_Model_Fabric_Primary_v1.docx'

# Archive member that holds the main document body.
DOCUMENT_XML = 'word/document.xml'

# Expression whose matches are deleted from document.xml.
# NOTE(review): this literal contains no tag text at all, only a whitespace
# character class, so it cannot single out any particular XML element and
# re.sub() with it appears to leave the content unchanged. It looks as if the
# markup that originally surrounded the whitespace class was stripped before
# this file was saved -- TODO confirm and restore the intended pattern for
# "empty or whitespace-only runs". It is kept byte-for-byte until then.
ORPHANED_RUN_PATTERN = r'((?:[ \n\t]*?))+'


def strip_orphaned_runs(xml_text, pattern=ORPHANED_RUN_PATTERN):
    """Return *xml_text* with every match of *pattern* removed.

    Args:
        xml_text: Decoded contents of ``word/document.xml``.
        pattern: Regular expression (string or compiled) to delete.

    Returns:
        The text with all matches replaced by the empty string.
    """
    return re.sub(pattern, '', xml_text)


def _repack(src, dst_path, document_xml):
    """Copy the files of *src* into a new archive, swapping in *document_xml*.

    Entries are written in the order they appear in the source archive, so the
    package layout is kept instead of depending on filesystem walk order.
    Directory entries are skipped, so only files end up in the output.

    Args:
        src: Open ``zipfile.ZipFile`` to read from.
        dst_path: Path of the archive to create.
        document_xml: Replacement bytes for ``word/document.xml``.
    """
    with zipfile.ZipFile(dst_path, 'w', zipfile.ZIP_DEFLATED) as dst:
        for info in src.infolist():
            if info.is_dir():
                continue
            if info.filename == DOCUMENT_XML:
                payload = document_xml
            else:
                payload = src.read(info)
            # Use a fresh ZipInfo so sizes, CRC, flags and extra fields from
            # the source entry are not carried over into the new archive.
            entry = zipfile.ZipInfo(info.filename, date_time=info.date_time)
            entry.compress_type = zipfile.ZIP_DEFLATED
            entry.external_attr = info.external_attr
            dst.writestr(entry, payload)


def clean_docx(docx_path, pattern=ORPHANED_RUN_PATTERN):
    """Remove matches of *pattern* from a .docx file's main document part.

    The new archive is built in a temporary file next to *docx_path* and only
    moved over the original once it has been written completely, so a failure
    part-way through leaves the original document untouched.

    Args:
        docx_path: Path of the .docx file to rewrite.
        pattern: Regular expression whose matches are deleted from
            ``word/document.xml``.

    Raises:
        FileNotFoundError: If *docx_path* does not exist or the archive has no
            ``word/document.xml`` member.
        zipfile.BadZipFile: If *docx_path* is not a valid zip archive.
        UnicodeDecodeError: If ``word/document.xml`` is not valid UTF-8.
    """
    docx_path = Path(docx_path)

    # Same directory as the target so that os.replace() is a plain rename.
    fd, tmp_name = tempfile.mkstemp(suffix='.docx', dir=str(docx_path.parent))
    os.close(fd)
    tmp_path = Path(tmp_name)

    try:
        with zipfile.ZipFile(docx_path, 'r') as src:
            try:
                raw_xml = src.read(DOCUMENT_XML)
            except KeyError:
                raise FileNotFoundError(
                    f"{DOCUMENT_XML} not found in {docx_path}"
                ) from None

            print("Removing orphaned runs...")
            # Decode/encode explicitly instead of using text-mode file I/O so
            # that line endings inside the XML are not translated.
            cleaned_xml = strip_orphaned_runs(raw_xml.decode('utf-8'), pattern)
            print("Cleaned document")

            _repack(src, tmp_path, cleaned_xml.encode('utf-8'))

        # mkstemp() creates the file owner-only; keep the original's mode.
        shutil.copymode(docx_path, tmp_path)
        # The source archive is closed by now, so the swap is safe.
        os.replace(tmp_path, docx_path)
    finally:
        # Only still present when something above failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main():
    """Clean the hard-coded document and report completion."""
    clean_docx(DOCX_PATH)
    print(f"Fixed {DOCX_PATH}")


if __name__ == "__main__":
    main()