LangChain=>RecursiveCharacterTextSplitter
.Net版本LangChain源码:
RecursiveCharacterTextSplitter 的作用是按一组分隔符(优先按段落,其次按行、按空格,最后按单个字符)递归地拆分文本,并把拆分出的片段合并成长度受 chunkSize 限制的文本块。
调用方法:
var state_of_the_union_txt = "text-Content"; var textSplitter = new RecursiveCharacterTextSplitter(chunkSize: 300, chunkOverlap: 30); var texts = textSplitter.CreateDocuments(new List<string>() { state_of_the_union_txt });
RecursiveCharacterTextSplitter有4个参数:
public RecursiveCharacterTextSplitter(List<string>? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)
separators: 可以定义分隔符,如果不指定自定义的分隔符,默认的分隔符为:{ "\n\n", "\n", " ", "" };
chunkSize:每个文本块(chunk)的长度上限;
chunkOverlap:相邻文本块之间重叠部分的长度,不能大于 chunkSize,否则构造函数会抛出 ArgumentException;
lengthFunction:用于计算文本长度的函数,如果不指定,默认使用字符串的 Length。
主要包含两个方法:SplitText 和 MergeSplits。
using LangChain.Base;

namespace LangChain.TextSplitters;

/// <summary>
/// Implementation of splitting text that looks at characters.
/// Recursively tries to split by different characters to find one
/// that works.
/// </summary>
public class RecursiveCharacterTextSplitter : TextSplitter
{
    private readonly List<string> _separators;

    /// <summary>
    /// Creates a splitter that tries each separator in order.
    /// </summary>
    /// <param name="separators">
    /// Separators to try, in priority order. When null, the defaults
    /// <c>"\n\n"</c>, <c>"\n"</c>, <c>" "</c> and <c>""</c> are used.
    /// </param>
    /// <param name="chunkSize">Chunk size forwarded to the base class.</param>
    /// <param name="chunkOverlap">Chunk overlap forwarded to the base class.</param>
    /// <param name="lengthFunction">Length function forwarded to the base class.</param>
    public RecursiveCharacterTextSplitter(List<string>? separators = null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
        : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separators = separators ?? new List<string> { "\n\n", "\n", " ", "" };
    }

    /// <summary>
    /// Splits <paramref name="text"/> on the first configured separator it contains,
    /// merges the short pieces with <c>MergeSplits</c> and recursively re-splits
    /// pieces whose length is at least <c>ChunkSize</c>.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The resulting chunks, in document order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public override List<string> SplitText(string text)
    {
        if (text is null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        var finalChunks = new List<string>();

        // Default to the last separator; an empty separator list falls back to
        // per-character splitting instead of throwing from Last().
        string separator = _separators.Count > 0 ? _separators[_separators.Count - 1] : string.Empty;
        foreach (string candidate in _separators)
        {
            // The empty separator always "matches"; otherwise take the first one present in the text.
            if (candidate.Length == 0 || text.Contains(candidate))
            {
                separator = candidate;
                break;
            }
        }

        // An empty separator means "split into single characters".
        List<string> splits = separator.Length != 0
            ? text.Split(new string[] { separator }, StringSplitOptions.None).ToList()
            : text.ToCharArray().Select(c => c.ToString()).ToList();

        var goodSplits = new List<string>();
        foreach (string piece in splits)
        {
            // A piece as long as the whole input means the separator made no progress
            // (separator absent from the text, or a single-character text). Recursing on
            // it would repeat this exact call forever, so it is kept as-is even though
            // it is not shorter than ChunkSize.
            bool cannotSplitFurther = piece.Length == text.Length;

            if (piece.Length < ChunkSize || cannotSplitFurther)
            {
                goodSplits.Add(piece);
                continue;
            }

            // Flush the short pieces collected so far before handling the oversized one,
            // so chunks stay in document order.
            if (goodSplits.Any())
            {
                finalChunks.AddRange(MergeSplits(goodSplits, separator));
                goodSplits.Clear();
            }

            // The oversized piece is split again; the separator search restarts from the
            // beginning of the list and will pick one that occurs in this piece.
            finalChunks.AddRange(SplitText(piece));
        }

        if (goodSplits.Any())
        {
            finalChunks.AddRange(MergeSplits(goodSplits, separator));
        }

        return finalChunks;
    }
}
1. SplitText 方法:
SplitText函数的主要功能是将输入的文本字符串按照一定的分隔符进行分割。
它首先尝试找到一个合适的分隔符(从 _separators 列表中选择),然后使用这个分隔符来分割文本。
如果选中的分隔符是空字符串 ""(默认分隔符列表的最后一项),它会将文本转换为字符数组,并将每个字符作为一个单独的元素;如果列表中的分隔符都没有出现在文本中、且列表里不含空字符串,则使用列表中的最后一个分隔符。
然后,它会检查每个分割的部分,如果长度小于 base.ChunkSize,则将其添加到 goodSplits 列表中。
如果长度大于或等于 base.ChunkSize,则先将 goodSplits 列表中已有的元素合并(调用 MergeSplits),将结果添加到 finalChunks 列表中并清空 goodSplits;然后对这个过长的部分递归调用 SplitText,并把返回的结果也添加到 finalChunks 列表中。
最后,如果 goodSplits 列表中还有元素,它会再次进行合并,并将结果添加到 finalChunks 列表中。
using LangChain.Docstore;

namespace LangChain.Base;

/// <summary>
/// Functionality for splitting text.
/// </summary>
/// <remarks>
/// - ported from langchain/text_splitter.py
/// </remarks>
public abstract class TextSplitter
{
    private readonly int _chunkSize;
    private readonly int _chunkOverlap;
    private readonly Func<string, int> _lengthFunction;

    /// <summary>
    /// Initializes the chunking parameters shared by all splitters.
    /// </summary>
    /// <param name="chunkSize">Size limit used when merging splits into chunks.</param>
    /// <param name="chunkOverlap">Amount of trailing content carried over into the next chunk.</param>
    /// <param name="lengthFunction">Measures a string; defaults to its <c>Length</c>.</param>
    /// <exception cref="ArgumentException"><paramref name="chunkOverlap"/> is greater than <paramref name="chunkSize"/>.</exception>
    protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
    {
        if (chunkOverlap > chunkSize)
        {
            throw new ArgumentException($"Chunk overlap ({chunkOverlap}) is greater than chunk size ({chunkSize}).");
        }

        _chunkSize = chunkSize;
        _chunkOverlap = chunkOverlap;
        _lengthFunction = lengthFunction ?? new Func<string, int>((str) => str.Length);
    }

    /// <summary>The chunk size passed to the constructor.</summary>
    protected int ChunkSize => _chunkSize;

    /// <summary>The chunk overlap passed to the constructor.</summary>
    protected int ChunkOverlap => _chunkOverlap;

    /// <summary>
    /// Splits a single text into chunks.
    /// </summary>
    public abstract List<string> SplitText(string text);

    /// <summary>
    /// Create documents from a list of texts.
    /// </summary>
    /// <param name="texts">Texts to split; each chunk becomes one document.</param>
    /// <param name="metadatas">
    /// Optional metadata, one entry per text. Every chunk of a text receives that text's
    /// dictionary instance. When null, each text gets its own empty dictionary.
    /// </param>
    /// <returns>One document per chunk, in input order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="texts"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// If the number of texts and metadata(when not null) are not equal, this method will throw an ArgumentException.
    /// </exception>
    public List<Document> CreateDocuments(List<string> texts, List<Dictionary<string, object>>? metadatas = null)
    {
        if (texts is null)
        {
            throw new ArgumentNullException(nameof(texts));
        }

        // If no metadata is provided, create one empty dictionary PER text.
        // Enumerable.Repeat(new Dictionary(...), n) would hand the same instance to every
        // text, so mutating one document's metadata would silently change all of them.
        metadatas ??= texts.Select(_ => new Dictionary<string, object>()).ToList();

        if (texts.Count != metadatas.Count)
        {
            throw new ArgumentException("Number of texts and metadata must be equal.");
        }

        var documents = new List<Document>();

        // Each text is split into chunks, and each chunk is added to the list of documents.
        for (int i = 0; i < texts.Count; i++)
        {
            var metadata = metadatas[i];
            foreach (var chunk in SplitText(texts[i]))
            {
                documents.Add(new Document(chunk, metadata));
            }
        }

        return documents;
    }

    /// <summary>
    /// Splits existing documents: their page contents are re-chunked and each chunk
    /// is paired with the metadata of the document it came from.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
    public List<Document> SplitDocuments(List<Document> documents)
    {
        if (documents is null)
        {
            throw new ArgumentNullException(nameof(documents));
        }

        var texts = documents.Select(doc => doc.PageContent).ToList();
        var metadatas = documents.Select(doc => doc.Metadata).ToList();

        return CreateDocuments(texts, metadatas);
    }

    /// <summary>
    /// Joins a list of strings with a separator and returns null if the resulting string is empty
    /// </summary>
    /// <remarks>The joined text is trimmed before the emptiness check.</remarks>
    protected string? JoinDocs(List<string> docs, string separator)
    {
        var text = string.Join(separator, docs).Trim();

        return string.IsNullOrEmpty(text) ? null : text;
    }

    /// <summary>
    /// Merges a list of texts into chunks of size chunk_size with overlap
    /// </summary>
    /// <param name="splits">Pieces to merge, in order.</param>
    /// <param name="separator">Separator re-inserted between pieces of the same chunk.</param>
    /// <returns>The merged chunks; joined chunks that are empty after trimming are skipped.</returns>
    protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
    {
        var separatorLen = _lengthFunction(separator);
        var docs = new List<string>();       // result of chunks
        var currentDoc = new List<string>(); // pieces of the chunk being built
        int total = 0;                       // measured length of currentDoc including separators

        foreach (var split in splits)
        {
            int len = _lengthFunction(split);

            // If we can't fit the next split into the current chunk.
            // NOTE(review): with `>=` a split that would make the chunk exactly
            // _chunkSize long also triggers a flush, while the eviction loop below
            // uses `>` - confirm which boundary is intended.
            if (total + len + (currentDoc.Count > 0 ? separatorLen : 0) >= _chunkSize)
            {
                // if the chunk is already was too big
                if (total > _chunkSize)
                {
                    // todo: Implement a logger
                    // todo: Log a warning about a split that is larger than the chunk size
                }

                if (currentDoc.Count > 0)
                {
                    // Join all the pieces in the current chunk and add it to the result.
                    var doc = JoinDocs(currentDoc, separator);
                    if (doc != null)
                    {
                        docs.Add(doc);
                    }

                    // Drop pieces from the front until what remains is within the overlap
                    // budget and the next split fits. The Count guard prevents indexing an
                    // empty list when total can never drop to _chunkOverlap
                    // (e.g. a negative overlap).
                    while (currentDoc.Count > 0
                           && (total > _chunkOverlap
                               || (total + len + (currentDoc.Count > 1 ? separatorLen : 0) > _chunkSize && total > 0)))
                    {
                        total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLen : 0);
                        currentDoc.RemoveAt(0);
                    }
                }
            }

            // Add the next split to the current chunk and update the running length;
            // a separator is only counted once there is more than one piece.
            currentDoc.Add(split);
            total += len + (currentDoc.Count > 1 ? separatorLen : 0);
        }

        // Add the last chunk.
        var lastDoc = JoinDocs(currentDoc, separator);
        if (lastDoc != null)
        {
            docs.Add(lastDoc);
        }

        return docs;
    }

    // todo: Implement from_huggingface_tokenizer
    // todo: Implement from_tiktoken_encoder
}
2. MergeSplits 方法:
这个方法的主要目的是将输入的分割部分(splits)用分隔符重新拼接,合并成若干个长度受 _chunkSize 限制、相邻之间带有重叠内容的文本块。
它首先计算分隔符的长度,然后遍历每个分割部分。如果当前文档(currentDoc)的长度加上分隔符和下一个分割部分的长度大于或等于 _chunkSize,它会将当前文档的元素合并,并将结果添加到 docs 列表中。
然后,它会从当前文档的开始部分删除元素,直到剩余内容的长度不超过 _chunkOverlap 并且可以容纳下一个分割部分;剩下的元素就是与下一个文本块重叠的内容。
最后,它会将最后一个文档的元素合并,并将结果添加到 docs 列表中。