You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mdp/fix_docx_orphaned_runs.py

43 lines
1.3 KiB
Python

#!/usr/bin/env python3
"""Remove orphaned <w:r> elements from DOCX"""
import zipfile
from pathlib import Path
import tempfile
import re
# Default document to repair; used when no path is given on the command line.
docx_path = '/sessions/dreamy-great-hypatia/mnt/mdp/Fabric prime/02_Appendix_A_Security_Privacy_Model_Fabric_Primary_v1.docx'

# Archive member that holds the main document body inside the DOCX package.
DOCUMENT_PART = 'word/document.xml'

# A closing paragraph tag immediately followed by one or more attribute-less
# <w:r> elements that are empty or contain only whitespace.  '\r' is included
# because the XML is processed without newline translation (see fix_docx).
ORPHANED_RUNS = re.compile(r'</w:p>(?:<w:r>[ \t\r\n]*</w:r>)+')


def remove_orphaned_runs(xml):
    """Return *xml* with empty runs that directly follow ``</w:p>`` removed.

    Only runs written exactly as ``<w:r>`` (no attributes) whose content is
    empty or whitespace are dropped; the preceding ``</w:p>`` is kept.

    >>> remove_orphaned_runs('<w:p>a</w:p><w:r></w:r><w:p>b</w:p>')
    '<w:p>a</w:p><w:p>b</w:p>'
    """
    return ORPHANED_RUNS.sub('</w:p>', xml)


def fix_docx(path):
    """Rewrite the DOCX at *path* with orphaned runs stripped from its body.

    The replacement archive is built completely in a temporary directory and
    only then copied over the original, so a failure while reading or
    repacking leaves the source document untouched.  Parts are written in
    the same order as in the source archive; every part other than
    ``word/document.xml`` is copied through unchanged.

    Raises:
        FileNotFoundError: if *path* does not exist or the archive has no
            ``word/document.xml`` part.
        zipfile.BadZipFile: if *path* is not a valid zip archive.
    """
    source = Path(path)
    with tempfile.TemporaryDirectory() as tmpdir:
        rebuilt = Path(tmpdir) / 'rebuilt.docx'
        with zipfile.ZipFile(source, 'r') as src:
            if DOCUMENT_PART not in src.namelist():
                raise FileNotFoundError(f"{DOCUMENT_PART} not found in {source}")
            with zipfile.ZipFile(rebuilt, 'w', zipfile.ZIP_DEFLATED) as dst:
                for info in src.infolist():
                    data = src.read(info)
                    if info.filename == DOCUMENT_PART:
                        # Decode/encode the bytes directly instead of using a
                        # text-mode file, so line endings are never translated.
                        cleaned = remove_orphaned_runs(data.decode('utf-8'))
                        data = cleaned.encode('utf-8')
                    dst.writestr(info.filename, data)
        # Overwrite in place (rather than rename) so the original file keeps
        # its location and permissions and no cross-device move is needed.
        source.write_bytes(rebuilt.read_bytes())


def main():
    """Repair the DOCX named on the command line, or the default one."""
    import sys

    target = sys.argv[1] if len(sys.argv) > 1 else docx_path
    print("Removing orphaned <w:r> runs...")
    fix_docx(target)
    print("Cleaned document")
    print(f"Fixed {target}")


if __name__ == "__main__":
    main()