精彩妇女av,99久草视频

將一個word文檔中的詞條內容，按照另一個文本文件中的順序進行重組，生成新的文件。
from docx import Document
from docx.shared import Inches, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement, parse_xml
from docx.oxml.ns import nsdecls, qn
from docx.table import Table
import re
from difflib import SequenceMatcher
from docx.table import _Cell, Table as DocxTable
from docx import Document as DocxDocument
from copy import deepcopy

def read_extracted_headings(filename):
    """Return the full text of the headings outline file.

    Args:
        filename: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        The entire file content as a single string.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        return handle.read()

def parse_structure(content):
    """Parse the outline text into an ordered list of structure entries.

    Each non-blank line becomes one dict, classified after stripping
    surrounding whitespace:

    * a line starting with '第' and containing '部分' is a ``part`` (level 0);
    * a line starting with one of ``A.`` .. ``H.`` is a ``section`` (level 1);
    * any other line is a ``term`` (level 2) that also records the part and
      section it appeared under (``None`` when there is none).

    Args:
        content: The whole outline as one string, lines separated by '\\n'.

    Returns:
        A list of dicts in the same order as the lines of ``content``.
    """
    section_prefixes = ('A.', 'B.', 'C.', 'D.', 'E.', 'F.', 'G.', 'H.')
    entries = []
    active_part = None
    active_section = None

    for raw_line in content.strip().split('\n'):
        title = raw_line.strip()
        if not title:
            continue

        if title.startswith('第') and '部分' in title:
            # A new part begins, so the section of the previous part no
            # longer applies to the terms that follow.
            active_part, active_section = title, None
            entries.append({'type': 'part', 'title': title, 'level': 0})
            continue

        if title.startswith(section_prefixes):
            active_section = title
            entries.append({'type': 'section', 'title': title, 'level': 1})
            continue

        # Everything else is treated as a term.
        entries.append({
            'type': 'term',
            'title': title,
            'level': 2,
            'part': active_part,
            'section': active_section,
        })

    return entries

def analyze_document_styles(doc):
    """Collect the style names used by the non-blank paragraphs of ``doc``.

    Progress and the sorted result are printed to stdout.

    Args:
        doc: An object exposing ``paragraphs``; each paragraph must provide
            ``text`` and ``style.name``.

    Returns:
        A set with the style name of every paragraph whose text is not
        empty after stripping whitespace.
    """
    print("正在分析文檔樣式...")
    seen_styles = {
        para.style.name
        for para in doc.paragraphs
        if para.text.strip()
    }
    print(f"文檔中發(fā)現(xiàn)的樣式: {sorted(seen_styles)}")
    return seen_styles

def strip_brackets(text):
    """Remove every parenthesised group, brackets included, from ``text``.

    Both ASCII ``()`` and full-width ``（）`` brackets are recognised, and an
    opening bracket of one kind may be closed by the other kind. A group
    runs from an opening bracket to the first closing bracket after it.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all such groups deleted.
    """
    bracketed_group = re.compile(r'[\(（][^\)）]*[\)）]')
    return bracketed_group.sub('', text)

def extract_terms_from_source_enhanced(source_doc):
    """Map every term heading of ``source_doc`` to the content below it.

    A paragraph counts as a term heading when its style name is one of the
    candidate heading-2 style names that actually occurs in the document.
    The content of a heading is everything (paragraphs and tables, in
    document order) between that heading and the next heading, or the end
    of the body for the last heading.

    Args:
        source_doc: The python-docx document to read.

    Returns:
        A dict keyed by the stripped heading text. Each value is a list of
        ``('paragraph', Paragraph)`` and ``('table', <table XML element>)``
        tuples. Headings with identical text share one key, so the content
        of a later heading replaces that of an earlier one.
    """
    possible_heading_styles = [
        'Heading 2', '標(biāo)題2', '標(biāo)題 2', '樣式2', 'Heading2', '2級(jí)標(biāo)題', 'Title 2'
    ]
    print("正在提取源文檔中的關(guān)鍵詞...")
    styles_found = analyze_document_styles(source_doc)
    actual_heading_styles = [style for style in possible_heading_styles if style in styles_found]
    print(f"找到的標(biāo)題樣式: {actual_heading_styles}")
    all_paragraphs = source_doc.paragraphs

    # (paragraph index, heading text) for every heading paragraph.
    heading_indices = [
        (i, para.text.strip())
        for i, para in enumerate(all_paragraphs)
        if para.style.name in actual_heading_styles
    ]
    print(f"總共找到 {len(heading_indices)} 個(gè)關(guān)鍵詞")

    # Build the interleaved paragraph/table sequence in body order. While
    # doing so, remember where each paragraph lands so that a paragraph
    # index can later be turned into an element position with one lookup
    # instead of rescanning the whole sequence for every heading.
    elements = []
    para_positions = []  # para_positions[k] == index in `elements` of the k-th paragraph
    para_idx = 0
    for el in source_doc.element.body:
        if el.tag.endswith('p'):
            # Only accept the element if it is the next paragraph that
            # python-docx itself reports, keeping both views in lockstep.
            if para_idx < len(all_paragraphs) and all_paragraphs[para_idx]._element == el:
                para_positions.append(len(elements))
                elements.append(('paragraph', all_paragraphs[para_idx]))
                para_idx += 1
        elif el.tag.endswith('tbl'):
            elements.append(('table', el))

    # Slice out the content that belongs to each heading.
    terms_content = {}
    for idx, (start, heading) in enumerate(heading_indices):
        if idx + 1 < len(heading_indices):
            end = heading_indices[idx + 1][0]
        else:
            end = len(all_paragraphs)

        start_pos = para_positions[start] if start < len(para_positions) else None
        end_pos = para_positions[end] if end < len(para_positions) else None

        if start_pos is None:
            content_elements = []
        elif end_pos is None:
            # Last heading: take everything up to the end of the body.
            content_elements = elements[start_pos + 1:]
        else:
            content_elements = elements[start_pos + 1:end_pos]

        terms_content[heading] = content_elements
        print(f"  └─ 為 '{heading}' 收集了 {len(content_elements)} 個(gè)內(nèi)容元素")
    return terms_content

def similarity(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The ``SequenceMatcher.ratio()`` of the lower-cased inputs, a float
        between 0 and 1.
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

def fuzzy_match(target, candidates, threshold=0.6):
    """Find the candidate most similar to ``target``, ignoring bracketed text.

    Bracketed groups are stripped from both sides before comparing. A
    candidate is only accepted when its score reaches ``threshold`` and is
    strictly higher than the best score so far, so on a tie the earliest
    candidate wins.

    Args:
        target: The string to look up.
        candidates: Iterable of strings to compare against.
        threshold: Minimum similarity a candidate needs to be accepted.

    Returns:
        ``(best_candidate, score)``; ``(None, 0)`` when nothing qualifies.
    """
    wanted = strip_brackets(target)
    winner, top_score = None, 0
    for name in candidates:
        ratio = similarity(wanted, strip_brackets(name))
        if ratio < threshold or ratio <= top_score:
            continue
        winner, top_score = name, ratio
    return winner, top_score

def copy_paragraph_with_style(source_para, target_doc):
    """Append a copy of ``source_para`` to ``target_doc``.

    Copied: paragraph style and alignment; for each run its text, bold,
    italic, underline, font size, font name, RGB colour (when set) and
    highlight colour; and the paragraph's space-before, space-after and
    line-spacing when they are explicitly set on the source. Nothing else
    is carried over.

    Args:
        source_para: The paragraph to copy from.
        target_doc: The document that receives the new paragraph.

    Returns:
        The newly created paragraph.
    """
    new_para = target_doc.add_paragraph()

    # Paragraph-level formatting.
    new_para.style = source_para.style
    new_para.alignment = source_para.alignment

    # Recreate every run with its character formatting.
    for run in source_para.runs:
        new_run = new_para.add_run(run.text)

        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name

        rgb = run.font.color.rgb
        if rgb is not None:
            new_run.font.color.rgb = rgb

        # Highlight copying is best-effort: a failure here must not abort
        # the copy of the remaining content.
        try:
            new_run.font.highlight_color = run.font.highlight_color
        except Exception:
            pass

    # Spacing: compare against None rather than relying on truthiness, so
    # that an explicit value of zero on the source is copied as well.
    source_format = source_para.paragraph_format
    target_format = new_para.paragraph_format
    for attr in ('space_before', 'space_after', 'line_spacing'):
        value = getattr(source_format, attr)
        if value is not None:
            setattr(target_format, attr, value)

    return new_para


def copy_table_with_style(source_table_element, target_doc):
    """Append a table to ``target_doc`` mirroring a source table element.

    The source element is deep-copied into a scratch document so it can be
    read through the python-docx table API. A new table with the same
    number of rows and columns is then added to ``target_doc``, each cell's
    text is copied, and the table style is assigned. Only cell text and the
    table style are carried over.

    Args:
        source_table_element: The table XML element taken from the source
            document body.
        target_doc: The document that receives the new table.

    Returns:
        The new table, or ``None`` when the source has no rows or no
        columns.
    """
    # Wrap a private copy of the element in a throwaway document.
    scratch_doc = DocxDocument()
    scratch_doc._body.clear_content()
    scratch_doc._body._element.append(deepcopy(source_table_element))
    src_table = scratch_doc.tables[0]

    n_rows = len(src_table.rows)
    n_cols = len(src_table.columns)
    if not (n_rows and n_cols):
        return None

    clone = target_doc.add_table(rows=n_rows, cols=n_cols)
    for r, src_row in enumerate(src_table.rows):
        for c, src_cell in enumerate(src_row.cells):
            clone.cell(r, c).text = src_cell.text

    clone.style = src_table.style
    return clone

def insert_empty_paragraph(target_doc):
    """Append one empty paragraph to ``target_doc`` as a visual spacer.

    The paragraph is added through ``target_doc.add_paragraph()`` rather
    than by appending a raw ``<w:p>`` element to the body. A raw append
    makes the element the last child of the body, i.e. after a trailing
    section-properties element if one is present, whereas content added
    through the document API is expected to stay in front of it; the
    spacers would then collect at the very end of the file instead of
    following the content they were meant to separate.

    Args:
        target_doc: The document to extend.

    Returns:
        None.
    """
    target_doc.add_paragraph()

def create_new_document_ordered(source_doc, structure, source_terms):
    """Build a new document that follows ``structure`` entry by entry.

    * ``part`` entries become a centred level-0 heading followed by a page
      break.
    * ``section`` entries become a level-1 heading.
    * ``term`` entries become a level-2 heading. The term title is fuzzily
      matched against the keys of ``source_terms``; on a match the stored
      paragraphs and tables are copied below the heading, otherwise a
      placeholder paragraph is written. An empty paragraph follows every
      term.

    Progress is printed to stdout.

    Args:
        source_doc: Not used by this function; kept so existing callers
            keep working.
        structure: Ordered list of dicts with ``'type'`` and ``'title'``
            keys, as produced by ``parse_structure``.
        source_terms: Dict mapping a source heading to its list of
            ``('paragraph', obj)`` / ``('table', obj)`` content tuples.

    Returns:
        The newly created document (not yet saved).
    """
    # Minimum similarity for a source heading to count as a match.
    match_threshold = 0.6

    new_doc = Document()

    # One-inch margins on every side of every section.
    for section in new_doc.sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)

    term_count = 0
    matched_terms = set()
    # The candidate list does not change while iterating, so build it once.
    source_term_names = list(source_terms)

    print("\n開始創(chuàng)建新文檔...")
    print(f"源文檔中找到 {len(source_terms)} 個(gè)關(guān)鍵詞")
    print("=" * 50)

    for item in structure:
        item_type = item['type']

        if item_type == 'part':
            # Part title, followed by a page break.
            print(f"\n創(chuàng)建部分: {item['title']}")
            heading = new_doc.add_heading(item['title'], level=0)
            heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
            new_doc.add_page_break()

        elif item_type == 'section':
            print(f"\n創(chuàng)建章節(jié): {item['title']}")
            new_doc.add_heading(item['title'], level=1)

        elif item_type == 'term':
            target_term = item['title']

            # Ask for the best candidate regardless of score and apply the
            # threshold here; that way a near miss can still be reported in
            # the "skipped" branch below. The accepted matches are the same
            # as with a thresholded lookup.
            matched_term, similarity_score = fuzzy_match(
                target_term, source_term_names, threshold=0.0
            )

            if matched_term and similarity_score >= match_threshold:
                term_count += 1
                matched_terms.add(matched_term)

                print(f"[{term_count}] 正在復(fù)制詞條: '{target_term}'")
                if matched_term != target_term:
                    print(f"    └─ 匹配到源詞條: '{matched_term}' (相似度: {similarity_score:.2f})")

                # The heading uses the title from the outline, not the
                # source heading text.
                new_doc.add_heading(target_term, level=2)

                content_elements = source_terms[matched_term]
                if content_elements:
                    for tp, obj in content_elements:
                        if tp == 'paragraph':
                            copy_paragraph_with_style(obj, new_doc)
                        elif tp == 'table':
                            copy_table_with_style(obj, new_doc)

                    print(f"    └─ 已復(fù)制 {len(content_elements)} 個(gè)內(nèi)容元素")
                else:
                    print(f"    └─ 警告: 未找到內(nèi)容")
                    new_doc.add_paragraph(f"[未找到 '{target_term}' 的相關(guān)內(nèi)容]")

                insert_empty_paragraph(new_doc)
            else:
                print(f"[跳過] 未找到匹配的詞條: '{target_term}'")
                if matched_term:
                    print(f"    └─ 最佳匹配: '{matched_term}' (相似度: {similarity_score:.2f}, 低于閾值 0.6)")

                # Still emit the heading, with a placeholder marking the
                # missing content.
                new_doc.add_heading(target_term, level=2)
                new_doc.add_paragraph(f"[未找到 '{target_term}' 的相關(guān)內(nèi)容]")
                insert_empty_paragraph(new_doc)

    print("=" * 50)
    print("文檔創(chuàng)建完成！")
    print(f"總共處理詞條: {term_count}")
    print(f"成功匹配: {len(matched_terms)}")
    return new_doc

if __name__ == '__main__':
    # Input and output file names, resolved against the working directory.
    headings_txt = 'extracted_headings.txt'
    source_docx = 'AI_zero2hero20250604.docx'
    output_docx = 'AI_zero2hero20250606.docx'

    # Load the outline that dictates the order of the new document.
    structure = parse_structure(read_extracted_headings(headings_txt))

    # The source document is only read here; this script never saves it.
    print(f"打開源文檔: {source_docx}")
    source_doc = Document(source_docx)

    # Gather every term heading together with its content.
    source_terms = extract_terms_from_source_enhanced(source_doc)

    # Assemble the reordered document and write it out.
    print(f"\n將生成新文檔: {output_docx}")
    new_doc = create_new_document_ordered(source_doc, structure, source_terms)
    new_doc.save(output_docx)
    print(f"新文檔已保存: {output_docx}")
色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

word文檔詞條重新整理複製

word文檔詞條重新整理複製

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

word文檔詞條重新整理複製

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av