logo
Loading...

RAG 組件 - 分割器 (text splitter) - AI Agent 開發特訓營:短期實現智能自動化 - Cupoy

LangChain 文字分割器實戰指南 課程相關code我都上完到github空間中:請 git clone https://github.com/kevin801221/AgenticU-The-M...

LangChain 文字分割器實戰指南 課程相關code我都上完到github空間中:請 git clone https://github.com/kevin801221/AgenticU-The-Modular-Teaching-Hub-for-Modern-LLM-Agent-Frameworks.git 去下載相關程式碼。(可以幫忙點個星星唷!) 📚 參考資源 官方文檔 LangChain Text Splitters 官方文檔 Text Splitters API 參考 相關工具與技術 OpenAI Embeddings - 語意分割所需的向量化模型 Tiktoken - OpenAI 的 Token 計算工具 Semantic Chunking 研究 - 語意分割的理論基礎 目錄 為什麼需要文字分割 分割器選擇指南 基礎文字分割器 遞迴文字分割器 專用格式分割器 語意分割器 進階技巧與最佳實踐 實際應用案例 為什麼需要文字分割 核心問題 在 RAG 系統中,文字分割是必要的,因為: Token 限制:LLM 有上下文長度限制(如 GPT-4 的 8K-32K tokens) 檢索精確性:小區塊比大文件更容易精確匹配查詢 成本控制:較小的上下文意味著更低的 API 調用成本 處理效率:向量化和相似性計算在小區塊上更快 分割的關鍵考量 # 基本分割參數 split_config = { "chunk_size": 1000, # 區塊大小(字符數) "chunk_overlap": 200, # 重疊字符數 "separator": "\n\n" # 分隔符 } # 重疊的重要性 """ 區塊 1: "...人工智能的發展歷程可以追溯到1950年代。" ↓ 200字符重疊 區塊 2: "1950年代,計算機科學家開始探索機器學習..." """ 分割器選擇指南 內容類型 推薦分割器 主要優勢 一般文字 RecursiveCharacterTextSplitter 智能保留語意結構 程式碼 RecursiveCharacterTextSplitter.from_language() 理解語法結構 Markdown MarkdownHeaderTextSplitter 保留標題層次 HTML HTMLHeaderTextSplitter 按標題標籤分割 JSON RecursiveJsonSplitter 保持JSON結構完整 高語意要求 SemanticChunker 基於語意相似性 基礎文字分割器 CharacterTextSplitter 最簡單的分割器,適合結構化程度較低的文字: from langchain_text_splitters import CharacterTextSplitter # 基本使用 text_splitter = CharacterTextSplitter( separator="\n\n", # 分隔符 chunk_size=300, # 區塊大小 chunk_overlap=50, # 重疊大小 length_function=len, # 長度計算函數 ) # 分割文字 docs = text_splitter.create_documents([long_text]) # 分割已有的文檔 split_docs = text_splitter.split_documents(existing_docs) 使用場景與注意事項 def analyze_split_quality(docs): """分析分割品質""" sizes = [len(doc.page_content) for doc in docs] print(f"📊 分割分析:") print(f" 總區塊數: {len(docs)}") print(f" 平均大小: {sum(sizes)/len(sizes):.0f} 字符") print(f" 最大區塊: {max(sizes)} 字符") print(f" 最小區塊: {min(sizes)} 字符") # 檢查是否有過小或過大的區塊 too_small = len([s for s in sizes if s < 50]) too_large = len([s for s in sizes if s > 2000]) if too_small > 0: print(f" ⚠️ {too_small} 個區塊過小 (<50 字符)") if too_large > 0: print(f" ⚠️ {too_large} 個區塊過大 (>2000 字符)") # 使用範例 docs = text_splitter.create_documents([sample_text]) analyze_split_quality(docs) 遞迴文字分割器 RecursiveCharacterTextSplitter 官方推薦的通用分割器,智能地按語意結構分割: from langchain_text_splitters import RecursiveCharacterTextSplitter # 基本配置 text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, length_function=len, is_separator_regex=False, ) # 遞迴分割邏輯 default_separators = [ "\n\n", # 段落分割(優先) "\n", # 行分割 " ", # 詞分割 "" # 字符分割(最後手段) ] 自定義分隔符 def create_custom_splitter(content_type: str): """根據內容類型創建自定義分割器""" if content_type == "academic_paper": # 學術論文:按節和段落分割 separators = ["\n\n\n", "\n\n", "\n", ". ", " "] elif content_type == "legal_document": # 法律文件:按條款分割 separators = ["\n第", "\n(", "\n\n", "\n", " "] elif content_type == "technical_manual": # 技術手冊:按步驟和段落分割 separators = ["\n步驟", "\n##", "\n\n", "\n", " "] else: # 預設配置 separators = ["\n\n", "\n", " ", ""] return RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, separators=separators ) # 使用範例 academic_splitter = create_custom_splitter("academic_paper") docs = academic_splitter.create_documents([paper_text]) Token 基礎分割 def create_token_based_splitter(model_name: str = "gpt-4"): """創建基於 Token 計算的分割器""" import tiktoken # 獲取對應模型的編碼器 try: encoding = tiktoken.encoding_for_model(model_name) except KeyError: encoding = tiktoken.get_encoding("cl100k_base") # 預設編碼 def token_len(text: str) -> int: """計算文字的 token 數量""" return len(encoding.encode(text)) return RecursiveCharacterTextSplitter( chunk_size=1000, # 1000 tokens chunk_overlap=200, # 200 tokens 重疊 length_function=token_len, is_separator_regex=False, ) # 使用範例 token_splitter = create_token_based_splitter("gpt-4") docs = token_splitter.create_documents([text]) 專用格式分割器 程式碼分割器 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter # 支援的程式語言 supported_languages = [ Language.PYTHON, Language.JAVASCRIPT, Language.TYPESCRIPT, Language.JAVA, Language.CPP, Language.GO, Language.RUST, Language.MARKDOWN ] # Python 程式碼分割 python_splitter = RecursiveCharacterTextSplitter.from_language( language=Language.PYTHON, chunk_size=2000, chunk_overlap=200, ) # 分割 Python 程式碼 python_code = """ class DataProcessor: def __init__(self, data): self.data = data def process(self): # 處理資料的邏輯 return processed_data def main(): processor = DataProcessor(raw_data) result = processor.process() return result """ docs = python_splitter.create_documents([python_code]) # 檢查分割結果 for i, doc in enumerate(docs): print(f"區塊 {i+1}:") print(doc.page_content) print("-" * 40) Markdown 分割器 from langchain_text_splitters import MarkdownHeaderTextSplitter # 定義要分割的標題層級 headers_to_split_on = [ ("#", "Header 1"), # H1 標題 ("##", "Header 2"), # H2 標題 ("###", "Header 3"), # H3 標題 ] markdown_splitter = MarkdownHeaderTextSplitter( headers_to_split_on=headers_to_split_on, strip_headers=False, # 保留標題 ) # 範例 Markdown 文檔 markdown_doc = """ # 機器學習簡介 機器學習是人工智能的一個重要分支。 ## 監督學習 監督學習使用標記的資料進行訓練。 ### 分類問題 分類是預測離散標籤的任務。 ### 回歸問題 回歸是預測連續數值的任務。 ## 無監督學習 無監督學習從未標記的資料中學習模式。 """ # 執行分割 md_header_splits = markdown_splitter.split_text(markdown_doc) # 檢查結果和 metadata for split in md_header_splits: print(f"內容: {split.page_content[:50]}...") print(f"Metadata: {split.metadata}") print("-" * 50) JSON 分割器 from langchain_text_splitters import RecursiveJsonSplitter # JSON 資料範例 json_data = { "users": [ { "id": 1, "name": "Alice", "profile": { "age": 30, "city": "台北", "interests": ["閱讀", "旅行", "程式設計"] } }, { "id": 2, "name": "Bob", "profile": { "age": 25, "city": "高雄", "interests": ["音樂", "運動"] } } ], "metadata": { "version": "1.0", "created_at": "2024-01-01" } } # 創建 JSON 分割器 json_splitter = RecursiveJsonSplitter(max_chunk_size=300) # 分割 JSON json_chunks = json_splitter.split_json(json_data=json_data) # 檢查分割結果 for i, chunk in enumerate(json_chunks): print(f"JSON 區塊 {i+1}:") print(chunk.page_content) print("-" * 30) 語意分割器 SemanticChunker 基於語意相似性的智能分割器: from langchain_experimental.text_splitter import SemanticChunker from langchain_openai.embeddings import OpenAIEmbeddings # 創建語意分割器 embeddings = OpenAIEmbeddings() semantic_splitter = SemanticChunker( embeddings, breakpoint_threshold_type="percentile", # 閾值類型 breakpoint_threshold_amount=95 # 在相似度最低的5%處分割 ) # 不同的閾值類型 threshold_types = { "percentile": "百分位數 - 在相似度差異最大的地方分割", "standard_deviation": "標準差 - 基於統計分佈分割", "interquartile": "四分位距 - 基於中位數分割" } # 使用範例 long_article = """ 人工智能的發展可以分為幾個重要階段。早期的人工智能研究主要集中在符號推理和專家系統上。 機器學習的出現標誌著AI發展的新階段。通過從資料中學習模式,機器能夠在沒有明確程式設計的情況下改善性能。 深度學習進一步推動了AI的發展。神經網路的多層結構使得機器能夠學習更複雜的表示和模式。 如今,大型語言模型代表了AI的最新發展。這些模型能夠理解和生成人類語言,開啟了新的應用可能性。 """ docs = semantic_splitter.create_documents([long_article]) # 分析語意分割效果 def analyze_semantic_splits(docs): """分析語意分割效果""" print(f"🧠 語意分割分析:") print(f" 總區塊數: {len(docs)}") for i, doc in enumerate(docs): sentences = doc.page_content.split('。') print(f"\n 區塊 {i+1} ({len(doc.page_content)} 字符):") print(f" 句子數: {len([s for s in sentences if s.strip()])}") print(f" 內容: {doc.page_content[:100]}...") analyze_semantic_splits(docs) 語意分割進階配置 def create_adaptive_semantic_splitter(content_type: str): """根據內容類型創建自適應語意分割器""" configs = { "technical": { "threshold_type": "standard_deviation", "threshold_amount": 1.5, # 較嚴格的分割 "description": "技術文檔需要精確的語意邊界" }, "narrative": { "threshold_type": "percentile", "threshold_amount": 90, # 較寬鬆的分割 "description": "敘述性文字允許更長的語意區塊" }, "mixed": { "threshold_type": "interquartile", "threshold_amount": 1.25, # 中等分割 "description": "混合內容採用平衡策略" } } config = configs.get(content_type, configs["mixed"]) return SemanticChunker( embeddings=OpenAIEmbeddings(), breakpoint_threshold_type=config["threshold_type"], breakpoint_threshold_amount=config["threshold_amount"] ), config["description"] # 使用範例 tech_splitter, description = create_adaptive_semantic_splitter("technical") print(f"配置說明: {description}") 進階技巧與最佳實踐 混合分割策略 def hybrid_text_splitting(text: str, content_type: str = "mixed"): """混合分割策略:結合多種分割器的優勢""" from langchain.schema import Document # 第一階段:結構化分割 if "markdown" in content_type.lower(): # 先按標題分割 md_splitter = MarkdownHeaderTextSplitter([ ("#", "Header 1"), ("##", "Header 2"), ]) initial_docs = md_splitter.split_text(text) else: # 一般文字先做基本分割 initial_docs = [Document(page_content=text)] # 第二階段:遞迴細分 recursive_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, ) final_docs = [] for doc in initial_docs: if len(doc.page_content) > 1000: # 大區塊進一步分割 sub_docs = recursive_splitter.split_documents([doc]) final_docs.extend(sub_docs) else: final_docs.append(doc) return final_docs # 使用範例 hybrid_docs = hybrid_text_splitting(sample_text, "markdown") 分割品質評估 def evaluate_split_quality(docs: list, original_text: str): """評估分割品質""" # 基本統計 chunk_sizes = [len(doc.page_content) for doc in docs] total_chars = sum(chunk_sizes) # 計算資訊保留度 original_length = len(original_text) retention_rate = total_chars / original_length # 計算語意連貫性(簡化版) coherence_scores = [] for doc in docs: sentences = doc.page_content.split('。') if len(sentences) > 1: # 簡單評估:檢查是否有不完整的句子 incomplete = sum(1 for s in sentences if len(s.strip()) < 10) coherence = 1 - (incomplete / len(sentences)) coherence_scores.append(coherence) avg_coherence = sum(coherence_scores) / len(coherence_scores) if coherence_scores else 0 # 生成評估報告 report = { "總區塊數": len(docs), "平均區塊大小": sum(chunk_sizes) / len(chunk_sizes), "資訊保留率": f"{retention_rate:.2%}", "語意連貫性": f"{avg_coherence:.2%}", "大小分佈": { "最小": min(chunk_sizes), "最大": max(chunk_sizes), "標準差": (sum((x - sum(chunk_sizes)/len(chunk_sizes))**2 for x in chunk_sizes) / len(chunk_sizes))**0.5 } } return report # 使用範例 quality_report = evaluate_split_quality(docs, original_text) print("📊 分割品質評估:") for key, value in quality_report.items(): print(f" {key}: {value}") 動態分割參數調整 def adaptive_chunk_size(text: str, target_chunks: int = None): """根據文本長度自適應調整分割參數""" text_length = len(text) if target_chunks: # 根據目標區塊數計算大小 chunk_size = text_length // target_chunks chunk_size = max(500, min(chunk_size, 2000)) # 限制在合理範圍 else: # 根據文本長度自動調整 if text_length < 2000: chunk_size = 500 elif text_length < 10000: chunk_size = 1000 else: chunk_size = 1500 # 重疊大小為區塊大小的 15-20% chunk_overlap = int(chunk_size * 0.175) return { "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "estimated_chunks": text_length // (chunk_size - chunk_overlap) } # 使用範例 text_stats = adaptive_chunk_size(sample_text, target_chunks=10) print(f"建議參數: {text_stats}") # 應用建議參數 splitter = RecursiveCharacterTextSplitter(**text_stats) docs = splitter.create_documents([sample_text]) 實際應用案例 案例 1:技術文檔處理 def process_technical_documentation(docs): """處理技術文檔的專用流程""" # 1. 識別內容類型 def identify_content_type(text): if "```" in text and "def " in text: return "code_heavy" elif "# " in text and "## " in text: return "markdown" else: return "general" processed_docs = [] for doc in docs: content_type = identify_content_type(doc.page_content) # 2. 選擇適當的分割器 if content_type == "code_heavy": splitter = RecursiveCharacterTextSplitter.from_language( language=Language.PYTHON, chunk_size=1500, chunk_overlap=200 ) elif content_type == "markdown": # 先按標題分割,再遞迴細分 md_splitter = MarkdownHeaderTextSplitter([ ("##", "Section"), ("###", "Subsection") ]) md_splits = md_splitter.split_text(doc.page_content) recursive_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=150 ) for split in md_splits: if len(split.page_content) > 1000: sub_splits = recursive_splitter.split_documents([split]) processed_docs.extend(sub_splits) else: processed_docs.append(split) continue else: splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200 ) # 3. 執行分割 splits = splitter.split_documents([doc]) processed_docs.extend(splits) return processed_docs 案例 2:多語言內容處理 def multilingual_text_splitting(text: str, primary_language: str = "zh"): """多語言文本分割""" import re # 語言特定的分隔符 separators_config = { "zh": ["\n\n", "\n", "。", ";", ",", " "], # 中文 "en": ["\n\n", "\n", ". ", "; ", ", ", " "], # 英文 "ja": ["\n\n", "\n", "。", "、", " "], # 日文 } # 檢測文本中的語言分佈 def detect_language_sections(text): # 簡化的語言檢測 chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text)) english_chars = len(re.findall(r'[a-zA-Z]', text)) japanese_chars = len(re.findall(r'[\u3040-\u309f\u30a0-\u30ff]', text)) total_chars = chinese_chars + english_chars + japanese_chars if total_chars == 0: return primary_language ratios = { "zh": chinese_chars / total_chars, "en": english_chars / total_chars, "ja": japanese_chars / total_chars } return max(ratios, key=ratios.get) detected_lang = detect_language_sections(text) separators = separators_config.get(detected_lang, separators_config["en"]) # 創建適合的分割器 splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, separators=separators ) docs = splitter.create_documents([text]) # 為每個文檔添加語言資訊 for doc in docs: doc.metadata["detected_language"] = detected_lang doc.metadata["language_confidence"] = detect_language_sections(doc.page_content) return docs 案例 3:學術論文處理 def academic_paper_processing(paper_text: str): """學術論文專用處理流程""" # 1. 識別論文結構 sections = { "abstract": r"(abstract|摘要)", "introduction": r"(introduction|引言|前言)", "methodology": r"(methodology|method|方法)", "results": r"(results|結果)", "discussion": r"(discussion|討論)", "conclusion": r"(conclusion|結論)", "references": r"(references|參考文獻)" } # 2. 按節分割 section_docs = [] import re for section_name, pattern in sections.items(): matches = list(re.finditer(pattern, paper_text, re.IGNORECASE)) if matches: for match in matches: # 找到該節的開始位置 start = match.start() # 找到下一節的開始位置 next_section_start = len(paper_text) for next_pattern in sections.values(): next_matches = list(re.finditer(next_pattern, paper_text[start+100:], re.IGNORECASE)) if next_matches: next_section_start = min(next_section_start, start + 100 + next_matches[0].start()) section_text = paper_text[start:next_section_start].strip() if len(section_text) > 100: # 過濾太短的內容 doc = Document( page_content=section_text, metadata={ "section": section_name, "document_type": "academic_paper" } ) section_docs.append(doc) # 3. 對長節進一步分割 final_docs = [] for doc in section_docs: if len(doc.page_content) > 1500: # 根據節的類型選擇分割策略 if doc.metadata["section"] in ["methodology", "results"]: # 方法和結果部分可能包含較多技術細節 splitter = RecursiveCharacterTextSplitter( chunk_size=1200, chunk_overlap=200, separators=["\n\n", "\n", ". ", " "] ) else: # 其他部分使用標準分割 splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=150 ) sub_docs = splitter.split_documents([doc]) # 保留原始節資訊 for sub_doc in sub_docs: sub_doc.metadata.update(doc.metadata) final_docs.extend(sub_docs) else: final_docs.append(doc) return final_docs 總結與最佳實踐 🎯 分割器選擇決策樹 文本類型判斷 ├── 一般文本 → RecursiveCharacterTextSplitter ├── 程式碼 → RecursiveCharacterTextSplitter.from_language() ├── Markdown → MarkdownHeaderTextSplitter + Recursive ├── JSON → RecursiveJsonSplitter ├── 高語意要求 → SemanticChunker └── 混合內容 → 混合策略 📏 參數設置指南 內容類型 chunk_size chunk_overlap 說明 短文檔 500-800 100-150 保持完整性 長文檔 1000-1500 200-300 平衡效率與質量 技術文檔 1200-2000 200-400 保留技術概念完整性 對話文本 300-600 50-100 保持對話流暢性 ⚡ 效能最佳化建議 預處理優化 清理多餘空白和特殊字符 統一換行符格式 移除不必要的元數據 分割策略優化 短文檔避免過度分割 長文檔採用分層分割 根據下游任務調整參數 記憶體管理 大文檔流式處理 及時釋放不需要的中間結果 使用生成器處理大量文檔 🚨 常見陷阱與解決方案 1. 過度分割問題 # ❌ 錯誤:chunk_size 設置過小 bad_splitter = RecursiveCharacterTextSplitter( chunk_size=100, # 太小了! chunk_overlap=50 ) # ✅ 正確:根據內容調整大小 good_splitter = RecursiveCharacterTextSplitter( chunk_size=800, # 適中的大小 chunk_overlap=100 ) 2. 重疊設置不當 # ❌ 錯誤:重疊太大或太小 bad_overlap = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=800 # 重疊 80%,太多了! ) # ✅ 正確:重疊 15-25% 比較適合 good_overlap = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200 # 重疊 20% ) 3. 忽略語意邊界 def check_semantic_boundaries(docs): """檢查是否在不當位置分割""" issues = [] for i, doc in enumerate(docs): content = doc.page_content # 檢查是否在句子中間分割 if not content.rstrip().endswith(('.', '!', '?', '。', '!', '?')): issues.append(f"區塊 {i+1} 可能在句子中間分割") # 檢查是否分割了重要概念 if content.count(',') > content.count('。') * 3: issues.append(f"區塊 {i+1} 可能分割了長句或列表") return issues # 使用範例 issues = check_semantic_boundaries(docs) if issues: print("⚠️ 發現潛在分割問題:") for issue in issues: print(f" {issue}") 📊 分割效果評估工具 def comprehensive_split_analysis(original_text: str, docs: list): """全面的分割效果分析""" analysis = {} # 1. 基礎統計 chunk_sizes = [len(doc.page_content) for doc in docs] analysis['basic_stats'] = { 'total_chunks': len(docs), 'avg_size': sum(chunk_sizes) / len(chunk_sizes), 'size_std': (sum((x - analysis['basic_stats']['avg_size'])**2 for x in chunk_sizes) / len(chunk_sizes))**0.5, 'min_size': min(chunk_sizes), 'max_size': max(chunk_sizes) } # 2. 內容保留分析 total_split_length = sum(chunk_sizes) analysis['content_retention'] = { 'original_length': len(original_text), 'total_split_length': total_split_length, 'retention_rate': total_split_length / len(original_text) } # 3. 分佈分析 size_ranges = { 'very_small': len([s for s in chunk_sizes if s < 200]), 'small': len([s for s in chunk_sizes if 200 <= s < 500]), 'medium': len([s for s in chunk_sizes if 500 <= s < 1000]), 'large': len([s for s in chunk_sizes if 1000 <= s < 1500]), 'very_large': len([s for s in chunk_sizes if s >= 1500]) } analysis['size_distribution'] = size_ranges # 4. 語意完整性檢查(簡化) incomplete_chunks = 0 for doc in docs: content = doc.page_content.strip() if not content.endswith(('.', '!', '?', '。', '!', '?', '\n')): incomplete_chunks += 1 analysis['semantic_integrity'] = { 'incomplete_chunks': incomplete_chunks, 'completion_rate': 1 - (incomplete_chunks / len(docs)) } return analysis # 使用範例並生成報告 def print_analysis_report(analysis): """印出分析報告""" print("📊 文字分割分析報告") print("=" * 50) # 基礎統計 stats = analysis['basic_stats'] print(f"\n📈 基礎統計:") print(f" 總區塊數: {stats['total_chunks']}") print(f" 平均大小: {stats['avg_size']:.0f} 字符") print(f" 大小標準差: {stats['size_std']:.0f}") print(f" 大小範圍: {stats['min_size']} - {stats['max_size']}") # 內容保留 retention = analysis['content_retention'] print(f"\n💾 內容保留:") print(f" 原始長度: {retention['original_length']:,} 字符") print(f" 分割後總長度: {retention['total_split_length']:,} 字符") print(f" 保留率: {retention['retention_rate']:.1%}") # 大小分佈 dist = analysis['size_distribution'] print(f"\n📊 大小分佈:") for size_range, count in dist.items(): percentage = count / stats['total_chunks'] * 100 print(f" {size_range}: {count} 個 ({percentage:.1f}%)") # 語意完整性 integrity = analysis['semantic_integrity'] print(f"\n🎯 語意完整性:") print(f" 不完整區塊: {integrity['incomplete_chunks']}") print(f" 完整性評分: {integrity['completion_rate']:.1%}") # 建議 print(f"\n💡 最佳化建議:") if integrity['completion_rate'] < 0.8: print(" • 考慮調整分隔符優先級") print(" • 增加 chunk_overlap 大小") if dist['very_small'] > stats['total_chunks'] * 0.2: print(" • 過小區塊過多,考慮增加 chunk_size") if dist['very_large'] > stats['total_chunks'] * 0.1: print(" • 過大區塊存在,考慮減少 chunk_size") if stats['size_std'] > stats['avg_size'] * 0.5: print(" • 區塊大小變異過大,考慮使用 SemanticChunker") # 完整使用範例 splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) docs = splitter.create_documents([sample_text]) analysis = comprehensive_split_analysis(sample_text, docs) print_analysis_report(analysis) 🔧 實用工具函數 def optimize_split_parameters(text: str, target_chunk_count: int = None): """自動最佳化分割參數""" text_length = len(text) if target_chunk_count: optimal_chunk_size = text_length // target_chunk_count # 限制在合理範圍內 optimal_chunk_size = max(300, min(optimal_chunk_size, 2000)) else: # 根據文本長度自動調整 if text_length < 1000: optimal_chunk_size = 300 elif text_length < 5000: optimal_chunk_size = 800 elif text_length < 20000: optimal_chunk_size = 1200 else: optimal_chunk_size = 1500 optimal_overlap = int(optimal_chunk_size * 0.2) # 20% 重疊 return { 'chunk_size': optimal_chunk_size, 'chunk_overlap': optimal_overlap, 'estimated_chunks': text_length // (optimal_chunk_size - optimal_overlap) } def batch_process_with_different_splitters(texts: list): """批次測試不同分割器的效果""" splitters = { 'character': CharacterTextSplitter(chunk_size=1000, chunk_overlap=200), 'recursive': RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200), 'semantic': SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="percentile") } results = {} for splitter_name, splitter in splitters.items(): results[splitter_name] = [] for i, text in enumerate(texts): try: docs = splitter.create_documents([text]) analysis = comprehensive_split_analysis(text, docs) results[splitter_name].append({ 'text_id': i, 'chunk_count': len(docs), 'avg_chunk_size': analysis['basic_stats']['avg_size'], 'completion_rate': analysis['semantic_integrity']['completion_rate'] }) except Exception as e: print(f"❌ {splitter_name} 處理文本 {i} 時出錯: {e}") results[splitter_name].append(None) # 生成比較報告 print("\n📊 分割器比較報告:") print("-" * 60) for splitter_name, result_list in results.items(): valid_results = [r for r in result_list if r is not None] if valid_results: avg_chunks = sum(r['chunk_count'] for r in valid_results) / len(valid_results) avg_size = sum(r['avg_chunk_size'] for r in valid_results) / len(valid_results) avg_completion = sum(r['completion_rate'] for r in valid_results) / len(valid_results) print(f"\n{splitter_name.upper()}:") print(f" 平均區塊數: {avg_chunks:.1f}") print(f" 平均區塊大小: {avg_size:.0f}") print(f" 平均完整性: {avg_completion:.1%}") return results 結論 🎯 關鍵要點總結 選擇合適的分割器 一般文本:RecursiveCharacterTextSplitter 程式碼:語言專用分割器 結構化內容:專用格式分割器 高語意要求:SemanticChunker 參數調整原則 chunk_size:根據內容類型和下游任務調整 chunk_overlap:通常設為 chunk_size 的 15-25% 分隔符:根據內容結構自定義 品質保證 定期評估分割效果 監控語意完整性 根據實際效果調整策略 🚀 進階發展方向 智能分割:結合 NLP 技術的更精確語意邊界識別 多模態分割:處理包含圖片、表格的複雜文檔 動態調整:根據檢索效果自動最佳化分割參數 領域特化:針對特定領域的專用分割策略 文字分割是 RAG 系統中的關鍵環節,正確的分割策略能顯著提升系統的檢索準確性和生成品質。選擇最適合的分割器和參數,並持續監控和最佳化,是建構高效 RAG 系統的重要基礎。 本指南基於 LangChain 官方文檔和實際專案經驗編寫,持續更新以反映最新的技術發展和最佳實踐。