LangChain=>RecursiveCharacterTextSplitter

 .Net版本LangChain源码:

github.com

RecursiveCharacterTextSplitter的作用是按照段落拆分文本

调用方法:

// Raw text to be split (placeholder content).
var sourceText = "text-Content";

// Configure the splitter with a chunk size of 300 and an overlap of 30.
var splitter = new RecursiveCharacterTextSplitter(chunkSize: 300, chunkOverlap: 30);

// Split the single input text and wrap every resulting chunk in a document.
var inputs = new List<string> { sourceText };
var texts = splitter.CreateDocuments(inputs);

RecursiveCharacterTextSplitter有4个参数:

public RecursiveCharacterTextSplitter(List<string>? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)

separators: 可以定义分隔符,如果不指定自定义的分隔符,默认的分隔符为:{ "\n\n", "\n", " ", "" };

chunkSize:段落的长度;

chunkOverlap:段落重叠部分的长度;

lengthFunction:用于计算文本长度的函数,如果不指定,默认使用字符串的字符数(str.Length)。

主要包含两个方法:SplitText 和 MergeSplits。

using LangChain.Base;

namespace LangChain.TextSplitters;

/// <summary>
/// Implementation of splitting text that looks at characters.
/// Recursively tries to split by different characters to find one
/// that works.
/// </summary>
/// <summary>
/// Character-based text splitter. It picks the first configured separator that
/// occurs in the text, splits on it, and recursively re-splits every piece that
/// is still too long.
/// </summary>
public class RecursiveCharacterTextSplitter:TextSplitter
{
    private readonly List<string> _separators;

    /// <summary>
    /// Creates a recursive character splitter.
    /// </summary>
    /// <param name="separators">
    /// Separators tried in order. When null, <c>"\n\n"</c>, <c>"\n"</c>, <c>" "</c> and <c>""</c> are used.
    /// An empty-string separator means "split into single characters".
    /// </param>
    /// <param name="chunkSize">Chunk size, forwarded to the base class.</param>
    /// <param name="chunkOverlap">Chunk overlap, forwarded to the base class.</param>
    /// <param name="lengthFunction">Length function, forwarded to the base class.</param>
    public RecursiveCharacterTextSplitter(List<string>? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separators = separators ?? new List<string> { "\n\n", "\n", " ", "" };
    }

    /// <summary>
    /// Splits <paramref name="text"/> into chunks.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The chunks, in the order they appear in <paramref name="text"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public override List<string> SplitText(string text)
    {
        if (text is null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        string separator = ChooseSeparator(text);

        // An empty separator means "one element per character".
        List<string> splits = separator.Length != 0
            ? text.Split(new[] { separator }, StringSplitOptions.None).ToList()
            : text.Select(c => c.ToString()).ToList();

        var finalChunks = new List<string>();
        var goodSplits = new List<string>(); // consecutive pieces short enough to be merged

        foreach (string s in splits)
        {
            if (s.Length < ChunkSize)
            {
                goodSplits.Add(s);
                continue;
            }

            // Flush the short pieces collected so far to keep the output in document order.
            if (goodSplits.Count > 0)
            {
                finalChunks.AddRange(MergeSplits(goodSplits, separator));
                goodSplits.Clear();
            }

            if (s.Length == text.Length)
            {
                // The split made no progress (no configured separator occurs in the
                // text, or the text is a single character). Recursing would call
                // SplitText with the very same input forever, so emit the piece as-is.
                finalChunks.Add(s);
            }
            else
            {
                finalChunks.AddRange(SplitText(s));
            }
        }

        if (goodSplits.Count > 0)
        {
            finalChunks.AddRange(MergeSplits(goodSplits, separator));
        }

        return finalChunks;
    }

    /// <summary>
    /// Returns the first separator that is empty or occurs in <paramref name="text"/>;
    /// falls back to the last configured separator when none matches.
    /// </summary>
    private string ChooseSeparator(string text)
    {
        foreach (string candidate in _separators)
        {
            if (candidate.Length == 0 || text.Contains(candidate))
            {
                return candidate;
            }
        }

        return _separators.Last();
    }
}
RecursiveCharacterTextSplitter

1. SplitText 方法:

SplitText函数的主要功能是将输入的文本字符串按照一定的分隔符进行分割。

它首先尝试找到一个合适的分隔符(从 _separators 列表中选择),然后使用这个分隔符来分割文本。

如果选中的分隔符是空字符串(即排在它前面的分隔符都没有出现在文本中),它会将文本转换为字符数组,并将每个字符作为一个单独的元素。

然后,它会检查每个分割的部分,如果长度小于 base.ChunkSize,则将其添加到 goodSplits 列表中。

如果长度大于或等于 base.ChunkSize,则先将 goodSplits 列表中已有的元素合并,并将结果添加到 finalChunks 列表中,然后对这个过长的部分递归调用 SplitText 继续拆分,并将拆分结果也添加到 finalChunks 列表中。

最后,如果 goodSplits 列表中还有元素,它会再次进行合并,并将结果添加到 finalChunks 列表中。

using LangChain.Docstore;

namespace LangChain.Base;

/// <summary>
/// Functionality for splitting text.
/// <remarks>
/// - ported from langchain/text_splitter.py
/// 
/// </remarks>
/// </summary>
public abstract class TextSplitter
{
    private readonly int _chunkSize;
    private readonly int _chunkOverlap;
    private readonly Func<string, int> _lengthFunction;

    /// <summary>
    /// Initializes the splitter settings.
    /// </summary>
    /// <param name="chunkSize">Maximum length of a merged chunk, measured with <paramref name="lengthFunction"/>.</param>
    /// <param name="chunkOverlap">Maximum length carried over from the end of one chunk into the next.</param>
    /// <param name="lengthFunction">Measures the length of a string; defaults to <see cref="string.Length"/>.</param>
    /// <exception cref="ArgumentException"><paramref name="chunkOverlap"/> is greater than <paramref name="chunkSize"/>.</exception>
    protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string,int>? lengthFunction = null)
    {
        if (chunkOverlap > chunkSize)
        {
            throw new ArgumentException($"Chunk overlap ({chunkOverlap}) is greater than chunk size ({chunkSize}).");
        }

        _chunkSize = chunkSize;
        _chunkOverlap = chunkOverlap;
        _lengthFunction = lengthFunction ?? (str => str.Length);
    }

    /// <summary>Configured chunk size.</summary>
    protected int ChunkSize => _chunkSize;

    /// <summary>Configured chunk overlap.</summary>
    protected int ChunkOverlap => _chunkOverlap;

    /// <summary>
    /// Splits a text into chunks.
    /// </summary>
    public abstract List<string> SplitText(string text);

    /// <summary>
    /// Create documents from a list of texts.
    /// </summary>
    /// <param name="texts">Texts to split.</param>
    /// <param name="metadatas">
    /// Optional metadata, one entry per text. Every created document receives its own
    /// shallow copy, so changing one document's metadata does not affect the others.
    /// </param>
    /// <returns>One document per chunk, in input order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="texts"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// If the number of texts and metadata(when not null) are not equal, this method will throw an ArgumentException.
    /// </exception>
    public List<Document> CreateDocuments(List<string> texts, List<Dictionary<string, object>>? metadatas = null)
    {
        if (texts is null)
        {
            throw new ArgumentNullException(nameof(texts));
        }

        if (metadatas is not null && texts.Count != metadatas.Count)
        {
            throw new ArgumentException("Number of texts and metadata must be equal.");
        }

        var documents = new List<Document>();

        // each text is split into chunks, and each chunk is added to the list of documents
        for (int i = 0; i < texts.Count; i++)
        {
            // when no metadata list is provided every document gets a fresh empty dictionary
            var metadata = metadatas?[i];

            foreach (var chunk in SplitText(texts[i]))
            {
                documents.Add(new Document(chunk, CopyMetadata(metadata)));
            }
        }

        return documents;
    }

    /// <summary>
    /// Splits the page content of every document and returns the resulting chunk documents,
    /// each carrying a copy of the metadata of the document it came from.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
    public List<Document> SplitDocuments(List<Document> documents)
    {
        if (documents is null)
        {
            throw new ArgumentNullException(nameof(documents));
        }

        var texts = documents.Select(doc => doc.PageContent).ToList();
        var metadatas = documents.Select(doc => doc.Metadata).ToList();

        return CreateDocuments(texts, metadatas);
    }

    /// <summary>
    /// Joins a list of strings with a separator, trims the result, and returns null if the resulting string is empty
    /// </summary>
    protected string? JoinDocs(List<string> docs, string separator)
    {
        var text = string.Join(separator, docs).Trim();
        return string.IsNullOrEmpty(text) ? null : text;
    }

    /// <summary>
    /// Merges a list of texts into chunks of size chunk_size with overlap
    /// </summary>
    /// <param name="splits">Pieces to merge, in order.</param>
    /// <param name="separator">Separator inserted between the pieces of a chunk.</param>
    /// <returns>The merged chunks; chunks that are empty after trimming are skipped.</returns>
    protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
    {
        var separatorLen = _lengthFunction(separator);
        var docs = new List<string>(); // result of chunks
        var currentDoc = new List<string>(); // pieces of the chunk being built
        int total = 0; // length of currentDoc including the separators between its pieces

        foreach (var split in splits)
        {
            int len = _lengthFunction(split);

            // The next split does not fit. Strictly greater: a chunk may be exactly
            // _chunkSize long, which keeps this test consistent with the eviction loop
            // below (otherwise a chunk is emitted, nothing is evicted, and the next
            // chunk repeats the whole previous one).
            if (total + len + (currentDoc.Count > 0 ? separatorLen : 0) > _chunkSize)
            {
                // todo: Implement a logger
                // todo: Log a warning when total > _chunkSize (a split larger than the chunk size)

                if (currentDoc.Count > 0)
                {
                    // join all the pieces of the current chunk and add it to the result
                    var doc = JoinDocs(currentDoc, separator);
                    if (doc != null)
                    {
                        docs.Add(doc);
                    }

                    // Drop pieces from the front until what remains is within the overlap
                    // and leaves room for the next split plus its separator. What remains
                    // becomes the overlap at the start of the next chunk. The Count check
                    // prevents indexing an empty list.
                    while (currentDoc.Count > 0
                           && (total > _chunkOverlap
                               || (total + len + separatorLen > _chunkSize && total > 0)))
                    {
                        total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLen : 0);
                        currentDoc.RemoveAt(0);
                    }
                }
            }

            // add the next split to the current chunk
            currentDoc.Add(split);
            total += len + (currentDoc.Count > 1 ? separatorLen : 0); // a separator is only counted between pieces
        }

        // add the last chunk
        var lastDoc = JoinDocs(currentDoc, separator);
        if (lastDoc != null)
        {
            docs.Add(lastDoc);
        }

        return docs;
    }

    /// <summary>
    /// Returns an independent shallow copy of <paramref name="source"/> (keeping its key comparer),
    /// or a new empty dictionary when <paramref name="source"/> is null.
    /// </summary>
    private static Dictionary<string, object> CopyMetadata(Dictionary<string, object>? source)
    {
        return source is null
            ? new Dictionary<string, object>()
            : new Dictionary<string, object>(source, source.Comparer);
    }

    // todo: Implement from_huggingface_tokenizer
    // todo: Implement from_tiktoken_encoder
}
TextSplitter

2. MergeSplits 方法:
这个方法的主要目的是将输入的分割部分(splits)按顺序合并成若干个受 _chunkSize 限制的块(文档),相邻的块之间可以保留一部分重叠内容。

它首先计算分隔符的长度,然后遍历每个分割部分。如果当前文档(currentDoc)加上下一个分割部分的长度大于 _chunkSize,它会将当前文档的元素合并,并将结果添加到 docs 列表中。

然后,它会从当前文档的开始部分删除元素,直到剩余内容的长度不超过 _chunkOverlap 并且可以容纳下一个分割部分;保留下来的内容就是与下一个块重叠的部分。

最后,它会将最后一个文档的元素合并,并将结果添加到 docs 列表中。

posted @ 2023-11-06 13:32  新*  阅读(931)  评论(0编辑  收藏  举报