將一個word文檔中的詞條內容,按照另一個文本文件中的順序進行重組,生成新的文件。
from docx import Document
from docx.shared import Inches, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement, parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.table import Table
import re
from difflib import SequenceMatcher
from docx.table import _Cell, Table as DocxTable
from docx import Document as DocxDocument
from copy import deepcopy
def read_extracted_headings(filename):
    """Return the complete text of the headings outline file.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        The file content as one string. No parsing is done here.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        return handle.read()
def parse_structure(content):
    """Turn the outline text into an ordered list of entry dicts.

    Every non-blank line (after trimming whitespace) yields one entry:

    * a line starting with '第' that also contains '部分' is a part
      (``level`` 0); it also resets the current section;
    * a line starting with one of ``A.`` .. ``H.`` is a section (``level`` 1);
    * any other line is a term (``level`` 2) that additionally records the
      part and section it appeared under (``None`` when none was seen yet).

    Args:
        content: Outline text, one heading per line.

    Returns:
        List of dicts in the order the lines appear.
    """
    section_prefixes = tuple(f'{letter}.' for letter in 'ABCDEFGH')
    entries = []
    part_title = None
    section_title = None

    for raw in content.strip().split('\n'):
        title = raw.strip()
        if not title:
            continue

        if title.startswith('第') and '部分' in title:
            # A new part starts: forget the section of the previous part.
            part_title, section_title = title, None
            entry = {'type': 'part', 'title': title, 'level': 0}
        elif title.startswith(section_prefixes):
            section_title = title
            entry = {'type': 'section', 'title': title, 'level': 1}
        else:
            entry = {
                'type': 'term',
                'title': title,
                'level': 2,
                'part': part_title,
                'section': section_title,
            }
        entries.append(entry)

    return entries
def analyze_document_styles(doc):
    """Collect the style names used by the document's non-blank paragraphs.

    Prints a progress line and the sorted style names as a side effect.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph must provide
            ``text`` and ``style.name``.

    Returns:
        Set of style-name strings.
    """
    print("正在分析文檔樣式...")
    used_styles = {
        para.style.name
        for para in doc.paragraphs
        if para.text.strip()  # whitespace-only paragraphs are ignored
    }
    print(f"文檔中發(fā)現(xiàn)的樣式: {sorted(used_styles)}")
    return used_styles
def strip_brackets(text):
    """Remove every parenthesised group, brackets included, from *text*.

    Both ASCII parentheses ``( )`` and full-width parentheses
    (U+FF08 / U+FF09) are recognised. A group runs from an opening bracket
    to the first closing bracket of either kind, so nested groups are not
    balanced.

    Args:
        text: Input string.

    Returns:
        *text* with all bracketed groups deleted.
    """
    # The full-width forms are written as \u escapes so the pattern survives
    # tools that normalise or re-encode non-ASCII punctuation.
    return re.sub(r'[(\uFF08][^)\uFF09]*[)\uFF09]', '', text)
def extract_terms_from_source_enhanced(source_doc):
    """Map each heading in the source document to the content that follows it.

    A paragraph counts as a heading when its style name is one of the
    candidate names below and that name is actually used in the document.
    The content of a heading is every body paragraph and table between it
    and the next heading (or the end of the body), in document order.

    Args:
        source_doc: Document object providing ``paragraphs`` and
            ``element.body``.

    Returns:
        Dict keyed by the stripped heading text. Each value is a list of
        ``('paragraph', paragraph_object)`` / ``('table', table_element)``
        tuples. If two headings share the same text, the later one
        overwrites the earlier one.
    """
    # NOTE(review): the non-ASCII style names below appear to contain
    # transliteration artifacts; verify them against the style names that
    # really occur in the source document.
    possible_heading_styles = [
        'Heading 2', '標(biāo)題2', '標(biāo)題 2', '樣式2', 'Heading2', '2級(jí)標(biāo)題', 'Title 2'
    ]
    print("正在提取源文檔中的關(guān)鍵詞...")
    styles_found = analyze_document_styles(source_doc)
    actual_heading_styles = [style for style in possible_heading_styles if style in styles_found]
    print(f"找到的標(biāo)題樣式: {actual_heading_styles}")

    all_paragraphs = source_doc.paragraphs
    # (paragraph index, heading text) for every heading paragraph.
    heading_indices = [
        (i, para.text.strip())
        for i, para in enumerate(all_paragraphs)
        if para.style.name in actual_heading_styles
    ]
    print(f"總共找到 {len(heading_indices)} 個(gè)關(guān)鍵詞")

    # Interleave paragraphs and tables in body order. para_positions[k] is
    # the index inside `elements` of the k-th paragraph entry, so headings
    # can be located without rescanning the list for every heading.
    elements = []
    para_positions = []
    para_idx = 0
    for el in source_doc.element.body:
        if el.tag.endswith('p'):
            if para_idx < len(all_paragraphs) and all_paragraphs[para_idx]._element == el:
                para_positions.append(len(elements))
                elements.append(('paragraph', all_paragraphs[para_idx]))
                para_idx += 1
        elif el.tag.endswith('tbl'):
            elements.append(('table', el))

    terms_content = {}
    known_paragraphs = len(para_positions)
    for idx, (start, heading) in enumerate(heading_indices):
        # Paragraph index of the next heading, or one past the last paragraph.
        if idx + 1 < len(heading_indices):
            end = heading_indices[idx + 1][0]
        else:
            end = len(all_paragraphs)

        start_pos = para_positions[start] if start < known_paragraphs else None
        end_pos = para_positions[end] if end < known_paragraphs else None

        if start_pos is None:
            content_elements = []
        elif end_pos is None:
            # Last heading: take everything up to the end of the body.
            content_elements = elements[start_pos + 1:]
        else:
            content_elements = elements[start_pos + 1:end_pos]

        terms_content[heading] = content_elements
        print(f" └─ 為 '{heading}' 收集了 {len(content_elements)} 個(gè)內(nèi)容元素")
    return terms_content
def similarity(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Both strings are lower-cased and compared with
    ``difflib.SequenceMatcher``; the result is its ``ratio()``.

    Args:
        a: First string.
        b: Second string.

    Returns:
        Float in the range 0.0 to 1.0 (1.0 for identical inputs).
    """
    matcher = SequenceMatcher(isjunk=None, a=a.lower(), b=b.lower())
    return matcher.ratio()
def fuzzy_match(target, candidates, threshold=0.6):
    """Pick the candidate most similar to *target*, ignoring bracketed text.

    Bracketed groups are stripped from both sides before scoring. A
    candidate is accepted only if its score reaches *threshold* and is
    strictly higher than the best score so far, so the first of several
    equally good candidates wins.

    Args:
        target: String to look up.
        candidates: Iterable of candidate strings.
        threshold: Minimum similarity a candidate needs to be accepted.

    Returns:
        ``(candidate, score)`` for the winner, or ``(None, 0)`` when no
        candidate qualifies.
    """
    needle = strip_brackets(target)
    winner, top_score = None, 0
    for name in candidates:
        ratio = similarity(needle, strip_brackets(name))
        if ratio < threshold or ratio <= top_score:
            continue
        winner, top_score = name, ratio
    return winner, top_score
def copy_paragraph_with_style(source_para, target_doc):
    """Append a copy of *source_para* to *target_doc*.

    Copied at paragraph level: style, alignment, space before/after and
    line spacing. Copied per run: text, bold, italic, underline, font size,
    font name, RGB font colour and highlight colour.

    Args:
        source_para: Paragraph to copy from.
        target_doc: Document that receives the new paragraph.

    Returns:
        The newly added paragraph.
    """
    new_para = target_doc.add_paragraph()
    new_para.style = source_para.style
    new_para.alignment = source_para.alignment

    for run in source_para.runs:
        new_run = new_para.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name

        rgb = run.font.color.rgb
        if rgb is not None:
            new_run.font.color.rgb = rgb

        # Highlight is copied on a best-effort basis: a value that cannot
        # be read or written must not abort the whole copy. Catching
        # Exception (rather than a bare except) keeps KeyboardInterrupt
        # and SystemExit propagating.
        try:
            new_run.font.highlight_color = run.font.highlight_color
        except Exception:
            pass

    # Compare against None instead of truthiness so that an explicit zero
    # (for example "0 pt before") is copied rather than silently dropped.
    src_format = source_para.paragraph_format
    dst_format = new_para.paragraph_format
    if src_format.space_before is not None:
        dst_format.space_before = src_format.space_before
    if src_format.space_after is not None:
        dst_format.space_after = src_format.space_after
    if src_format.line_spacing is not None:
        dst_format.line_spacing = src_format.line_spacing
    return new_para
def copy_table_with_style(source_table_element, target_doc):
    """Append a text-and-style copy of a table to *target_doc*.

    The source table element is deep-copied into a scratch document so it
    can be read through the table API. Only each cell's plain text and the
    table style are carried over to the new table.

    Args:
        source_table_element: Table XML element taken from the source body.
        target_doc: Document that receives the new table.

    Returns:
        The new table, or ``None`` when the source has no rows or columns.
    """
    # Wrap a private copy of the element so the source stays untouched.
    scratch = DocxDocument()
    scratch._body.clear_content()
    scratch._body._element.append(deepcopy(source_table_element))
    src_table = scratch.tables[0]

    n_rows, n_cols = len(src_table.rows), len(src_table.columns)
    if not (n_rows and n_cols):
        return None

    clone = target_doc.add_table(rows=n_rows, cols=n_cols)
    for r, row in enumerate(src_table.rows):
        for c, cell in enumerate(row.cells):
            clone.cell(r, c).text = cell.text
    clone.style = src_table.style
    return clone
def insert_empty_paragraph(target_doc):
    """Add an empty paragraph at the end of the document content.

    The paragraph is created with ``add_paragraph()`` rather than by
    appending a raw ``w:p`` element to the body: a raw append lands after
    the body's trailing section-properties element, so the blank paragraph
    would not follow the content that was just written.

    Args:
        target_doc: Document that receives the empty paragraph.
    """
    target_doc.add_paragraph()
def create_new_document_ordered(source_doc, structure, source_terms):
    """Build a new document that follows the order given by *structure*.

    Parts become level-0 headings (centred, followed by a page break),
    sections become level-1 headings and terms become level-2 headings.
    Each term title is fuzzy-matched against the keys of *source_terms*;
    on a match the stored paragraphs and tables are copied below the
    heading, otherwise a placeholder line is written. An empty paragraph
    is inserted after every term. Progress is printed throughout.

    Args:
        source_doc: Source document; not used by this function.
        structure: Ordered entry dicts with ``type`` and ``title`` keys.
        source_terms: Dict mapping heading text to a list of
            ``('paragraph' | 'table', object)`` tuples.

    Returns:
        The newly created document (not yet saved).
    """
    new_doc = Document()

    # One-inch margins on all four sides of every section.
    for sec in new_doc.sections:
        for side in ('top_margin', 'bottom_margin', 'left_margin', 'right_margin'):
            setattr(sec, side, Inches(1))

    term_count = 0
    matched_terms = set()
    print("\n開始創(chuàng)建新文檔...")
    print(f"源文檔中找到 {len(source_terms)} 個(gè)關(guān)鍵詞")
    print("=" * 50)

    for entry in structure:
        kind = entry['type']

        if kind == 'part':
            # Part title sits on its own page.
            print(f"\n創(chuàng)建部分: {entry['title']}")
            part_heading = new_doc.add_heading(entry['title'], level=0)
            part_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
            new_doc.add_page_break()
            continue

        if kind == 'section':
            print(f"\n創(chuàng)建章節(jié): {entry['title']}")
            new_doc.add_heading(entry['title'], level=1)
            continue

        if kind != 'term':
            continue

        target_term = entry['title']
        matched_term, similarity_score = fuzzy_match(
            target_term, list(source_terms.keys())
        )

        if not (matched_term and similarity_score >= 0.6):
            print(f"[跳過] 未找到匹配的詞條: '{target_term}'")
            if matched_term:
                print(f" └─ 最佳匹配: '{matched_term}' (相似度: {similarity_score:.2f}, 低于閾值 0.6)")
            # Keep the heading so the outline stays complete, and mark it
            # as having no content.
            new_doc.add_heading(target_term, level=2)
            new_doc.add_paragraph(f"[未找到 '{target_term}' 的相關(guān)內(nèi)容]")
            insert_empty_paragraph(new_doc)
            continue

        term_count += 1
        matched_terms.add(matched_term)
        print(f"[{term_count}] 正在復(fù)制詞條: '{target_term}'")
        if matched_term != target_term:
            print(f" └─ 匹配到源詞條: '{matched_term}' (相似度: {similarity_score:.2f})")

        # The heading uses the title from the outline, not the source text.
        new_doc.add_heading(target_term, level=2)

        pieces = source_terms[matched_term]
        if pieces:
            for piece_kind, piece in pieces:
                if piece_kind == 'paragraph':
                    copy_paragraph_with_style(piece, new_doc)
                elif piece_kind == 'table':
                    copy_table_with_style(piece, new_doc)
            print(f" └─ 已復(fù)制 {len(pieces)} 個(gè)內(nèi)容元素")
        else:
            print(f" └─ 警告: 未找到內(nèi)容")
            new_doc.add_paragraph(f"[未找到 '{target_term}' 的相關(guān)內(nèi)容]")
        insert_empty_paragraph(new_doc)

    print("=" * 50)
    print(f"文檔創(chuàng)建完成!")
    print(f"總共處理詞條: {term_count}")
    print(f"成功匹配: {len(matched_terms)}")
    return new_doc
if __name__ == '__main__':
    # Input and output files, resolved against the working directory.
    headings_txt = 'extracted_headings.txt'
    source_docx = 'AI_zero2hero20250604.docx'
    output_docx = 'AI_zero2hero20250606.docx'

    # Desired ordering: parts, sections and terms parsed from the text file.
    structure = parse_structure(read_extracted_headings(headings_txt))

    # Load the source document and gather the content under each heading.
    print(f"打開源文檔: {source_docx}")
    source_doc = Document(source_docx)
    source_terms = extract_terms_from_source_enhanced(source_doc)

    # Assemble the reordered document and write it out.
    print(f"\n將生成新文檔: {output_docx}")
    new_doc = create_new_document_ordered(source_doc, structure, source_terms)
    new_doc.save(output_docx)
    print(f"新文檔已保存: {output_docx}")