You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
#!/usr/bin/env python3
"""Surgically fix DOCX XML structure.

Removes run elements (``<w:r>...</w:r>``) that sit directly between the last
``</w:p>`` and ``</w:body>`` of ``word/document.xml``; runs are only valid
inside a paragraph, so such trailing runs are treated as orphaned.

Usage:
    script.py [path/to/file.docx]

When no path is given, the historical hard-coded document is processed.
"""

import os
import re
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path

DEFAULT_DOCX_PATH = '/sessions/dreamy-great-hypatia/mnt/mdp/Fabric prime/02_Appendix_A_Security_Privacy_Model_Fabric_Primary_v1.docx'

# Archive member that holds the main document body.
DOCUMENT_PART = 'word/document.xml'

# One complete run.  The opening tag may carry attributes.  The body is a
# "tempered" dot: it refuses to step over any opening/closing <w:p>/<w:r> tag
# (while still allowing <w:rPr>, <w:pPr>, ...), so a single match can never
# stretch across a real paragraph or into a neighbouring run.
_RUN = r'<w:r(?:\s[^>]*)?>(?:(?!</?w:[pr][\s>/]).)*</w:r>'
_RUN_RE = re.compile(_RUN, re.DOTALL)

# </w:p>, then ONE OR MORE runs, then </w:body>.  Requiring at least one run
# keeps well-formed documents from being reported as broken.
_ORPHANED_RUNS_RE = re.compile(
    r'(</w:p>)((?:\s*' + _RUN + r')+)(\s*</w:body>)',
    re.DOTALL,
)


def strip_orphaned_runs(xml):
    """Remove runs located between the last ``</w:p>`` and ``</w:body>``.

    Args:
        xml: Text of ``word/document.xml``.

    Returns:
        A ``(cleaned_xml, removed)`` tuple where ``removed`` is the number of
        run elements dropped.  When nothing matches, ``cleaned_xml`` equals
        ``xml`` and ``removed`` is ``0``.
    """
    removed = 0

    def _drop_runs(match):
        nonlocal removed
        removed += len(_RUN_RE.findall(match.group(2)))
        # Keep the paragraph end and the body end; discard only the runs.
        return match.group(1) + match.group(3)

    cleaned = _ORPHANED_RUNS_RE.sub(_drop_runs, xml)
    return cleaned, removed


def fix_docx(docx_path):
    """Strip orphaned runs from the document part of a .docx file in place.

    The archive is rebuilt member by member in its original order into a
    temporary file next to the document, which then replaces the original in
    one step.  The original is left untouched when nothing needs fixing or
    when rebuilding fails.

    Args:
        docx_path: Path (``str`` or ``Path``) of the .docx file to repair.

    Returns:
        Number of run elements removed (``0`` means the file was not
        rewritten).

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    source = Path(docx_path)

    # Read everything up front so the source archive is closed before it is
    # replaced (an open handle would block the replace on Windows).
    with zipfile.ZipFile(source, 'r') as archive:
        # Decode/encode explicitly: bytes in, bytes out, no newline translation.
        xml = archive.read(DOCUMENT_PART).decode('utf-8')
        cleaned, removed = strip_orphaned_runs(xml)
        if not removed:
            return 0
        members = [
            (info, archive.read(info.filename))
            for info in archive.infolist()
            if not info.is_dir()
        ]

    fd, tmp_name = tempfile.mkstemp(
        prefix=source.name + '.', suffix='.tmp', dir=source.parent
    )
    tmp_path = Path(tmp_name)
    try:
        with os.fdopen(fd, 'wb') as raw, \
                zipfile.ZipFile(raw, 'w', zipfile.ZIP_DEFLATED) as rebuilt:
            for info, payload in members:
                if info.filename == DOCUMENT_PART:
                    payload = cleaned.encode('utf-8')
                rebuilt.writestr(
                    info, payload, compress_type=zipfile.ZIP_DEFLATED
                )
        # mkstemp creates the file owner-only; carry over the document's mode.
        shutil.copymode(source, tmp_path)
        os.replace(tmp_path, source)
    finally:
        # Only still present if something failed before the replace.
        if tmp_path.exists():
            tmp_path.unlink()

    return removed


def main(argv=None):
    """Repair the .docx named on the command line (or the default document)."""
    args = sys.argv[1:] if argv is None else argv
    docx_path = args[0] if args else DEFAULT_DOCX_PATH

    print("Searching for malformed structure...")
    removed = fix_docx(docx_path)
    if removed:
        print(f"Removed {removed} orphaned run element(s)")
        print(f"Fixed {docx_path}")
    else:
        print(f"No orphaned run elements found; {docx_path} left unchanged")


if __name__ == '__main__':
    main()