算法基础<九> 字符串查找

单词查找树

image-20210426091237214

单词查找树是由链接的结点所组成的数据结构,这些链接可能为空,也可能指向其他结点。

每个结点都只可能有一个指向它的结点,称为它的父结点(只有一个结点除外,即根结点,没有任何结点指向根结点)。

每个结点都含有R 条链接, 其中R为字母表的大小。

单词查找树一般都含有大量的空链接,因此在绘制一棵单词查找树时一般会忽略空链接。

尽管链接指向的是结点,但是也可以看作链接指向的是另一棵单词查找树,它的根结点就是被指向的结点。

值为空的结点在符号表中没有对应的键,它们的存在是为了简化单词查找树中的查找操作。

查找

  • 键的尾字符所对应的结点中的值非空,这是一次命中的查找——键所对应的值就是键的尾字符所对应的结点中保存的值。

  • 键的尾字符所对应的结点中的值为空,这是一次未命中的查找——符号表中不存在被查找的键。

  • 查找结束于一条空链接,这也是一次未命中的查找。

image-20210426092022878

插入

  • 在到达键的尾字符之前就遇到了一个空链接。在这种情况下,单词查找树中不存在与键的尾字符对应的结点,因此需要为键中还未被检查的每个字符创建一个对应的结点,并将键的值保存到最后一个字符的结点中。

  • 在遇到空链接之前就到达了键的尾字符。在这种情况下,和关联数组一样,将该结点的值设为键所对应的值。

image-20210426094028418

表示

将空链接考虑进来将会突出单词查找树的以下重要性质:

  • 每个结点都含有R 个链接,对应着每个可能出现的字符;
  • 字符和键均隐式地保存在数据结构中。

以R = 26(小写英文字母表)为例,每个结点都含有一个值和26 个链接;下面的代码使用扩展ASCII 字母表,R = 256。

image-20210426100100612

/// <summary>
/// Symbol table with string keys, implemented as an R-way trie.
/// Each node holds an optional value and R child links, one per character of the alphabet.
/// A node whose value is null does not correspond to a key; it only exists as part of a path.
/// </summary>
/// <typeparam name="Value">Type of the values stored in the table (reference or value type).</typeparam>
public class TrieST<Value>
{
    private static readonly int R = 256;        // radix: size of the extended-ASCII alphabet

    private Node root;      // root of the trie (null while the trie holds no nodes)
    private int n;          // number of keys in the trie

    // R-way trie node. The value is kept boxed as Object so that null can mean
    // "no key ends at this node" even when Value is a value type such as int.
    private class Node
    {
        public Object val;
        public Node[] next = new Node[R];
    }

    public TrieST()
    {
    }

    // Returns the child of x reached through character c, or null when there is no such
    // child. Characters outside the alphabet (c >= R) can never be stored in the trie,
    // so they are reported as a missing link instead of indexing past the array.
    private static Node Child(Node x, char c)
    {
        return c < R ? x.next[c] : null;
    }

    /// <summary>
    /// Returns the value associated with the given key.
    /// </summary>
    /// <param name="key">The key to look up.</param>
    /// <returns>The stored value, or default(Value) when the key is not in the table.</returns>
    /// <exception cref="ArgumentNullException">key is null.</exception>
    public Value Get(String key)
    {
        if (key == null) throw new ArgumentNullException(nameof(key), "argument to get() is null");
        Node x = Get(root, key, 0);
        // An interior node exists for every proper prefix of a key but carries no value;
        // unboxing its null val into a value type would throw, so treat it as a miss.
        if (x == null || x.val == null) return default(Value);
        return (Value)x.val;
    }

    /// <summary>
    /// Tells whether the table contains the given key.
    /// </summary>
    /// <param name="key">The key to look up.</param>
    /// <returns>true when a value is stored under key, false otherwise.</returns>
    /// <exception cref="ArgumentNullException">key is null.</exception>
    public bool Contains(string key)
    {
        if (key == null) throw new ArgumentNullException(nameof(key), "argument to contains() is null");
        // Inspect the node itself: comparing Get(key) with null is always true for value types.
        Node x = Get(root, key, 0);
        return x != null && x.val != null;
    }

    // Returns the node reached by following key[d..] from x, or null when the path breaks.
    private Node Get(Node x, String key, int d)
    {
        if (x == null) return null;
        if (d == key.Length) return x;
        return Get(Child(x, key[d]), key, d + 1);
    }

    /// <summary>
    /// Inserts the key-value pair, overwriting the old value if the key is already present.
    /// A null value deletes the key.
    /// </summary>
    /// <param name="key">The key to insert.</param>
    /// <param name="val">The value to associate with the key.</param>
    /// <exception cref="ArgumentNullException">key is null.</exception>
    /// <exception cref="ArgumentException">key contains a character outside the R-character alphabet.</exception>
    public void Put(String key, Value val)
    {
        if (key == null) throw new ArgumentNullException(nameof(key), "first argument to put() is null");
        if (val == null)
        {
            Delete(key);
            return;
        }
        // Validate up front so that an unsupported key fails with a descriptive error
        // before any node is created.
        for (int i = 0; i < key.Length; i++)
        {
            if (key[i] >= R)
                throw new ArgumentException("key contains a character outside the " + R + "-character alphabet", nameof(key));
        }
        root = Put(root, key, val, 0);
    }

    // Stores val under key in the subtrie rooted at x, creating missing nodes on the way,
    // and returns the (possibly new) root of that subtrie.
    private Node Put(Node x, String key, Value val, int d)
    {
        if (x == null) x = new Node();
        if (d == key.Length)
        {
            if (x.val == null) n++;     // first value at this node: a new key
            x.val = val;
            return x;
        }
        char c = key[d];                // descend into the subtrie for the d-th character
        x.next[c] = Put(x.next[c], key, val, d + 1);
        return x;
    }

    /// <summary>
    /// Returns the number of key-value pairs in the table.
    /// </summary>
    /// <returns>The number of keys.</returns>
    public int Size()
    {
        return n;
    }

    /// <summary>
    /// Tells whether the table is empty.
    /// </summary>
    /// <returns>true when the table holds no keys.</returns>
    public bool IsEmpty()
    {
        return Size() == 0;
    }

    /// <summary>
    /// Returns all keys in the table, in ascending character order.
    /// </summary>
    /// <returns>The keys.</returns>
    public IEnumerable<string> Keys()
    {
        return KeysWithPrefix("");
    }

    /// <summary>
    /// Returns all keys that start with the given prefix.
    /// </summary>
    /// <param name="prefix">The prefix; the empty string matches every key.</param>
    /// <returns>The matching keys, in ascending character order.</returns>
    /// <exception cref="ArgumentNullException">prefix is null.</exception>
    public IEnumerable<string> KeysWithPrefix(String prefix)
    {
        if (prefix == null) throw new ArgumentNullException(nameof(prefix), "argument to keysWithPrefix() is null");
        Queue<String> results = new Queue<String>();
        Node x = Get(root, prefix, 0);
        Collect(x, new StringBuilder(prefix), results);
        return results;
    }

    // Appends every key in the subtrie rooted at x to results; prefix is the string
    // spelled out by the path from the root to x and is restored before returning.
    private void Collect(Node x, StringBuilder prefix, Queue<String> results)
    {
        if (x == null) return;
        if (x.val != null) results.Enqueue(prefix.ToString());
        for (char c = (char)0; c < R; c++)
        {
            prefix.Append(c);
            Collect(x.next[c], prefix, results);
            prefix.Remove(prefix.Length - 1, 1);
        }
    }

    /// <summary>
    /// Returns all keys that match the pattern, where '.' matches any single character.
    /// </summary>
    /// <param name="pattern">The pattern; matching keys have exactly the pattern's length.</param>
    /// <returns>The matching keys, in ascending character order.</returns>
    /// <exception cref="ArgumentNullException">pattern is null.</exception>
    public IEnumerable<String> KeysThatMatch(String pattern)
    {
        if (pattern == null) throw new ArgumentNullException(nameof(pattern), "argument to keysThatMatch() is null");
        Queue<String> results = new Queue<String>();
        Collect(root, new StringBuilder(), pattern, results);
        return results;
    }

    // Pattern-matching variant of Collect: prefix.Length is the index of the next pattern
    // character to match against the children of x.
    private void Collect(Node x, StringBuilder prefix, String pattern, Queue<String> results)
    {
        if (x == null) return;
        int d = prefix.Length;
        if (d == pattern.Length)
        {
            if (x.val != null) results.Enqueue(prefix.ToString());
            return;
        }
        char c = pattern[d];
        if (c == '.')
        {
            // wildcard: try every child
            for (char ch = (char)0; ch < R; ch++)
            {
                prefix.Append(ch);
                Collect(x.next[ch], prefix, pattern, results);
                prefix.Remove(prefix.Length - 1, 1);
            }
        }
        else
        {
            prefix.Append(c);
            Collect(Child(x, c), prefix, pattern, results);
            prefix.Remove(prefix.Length - 1, 1);
        }
    }

    /// <summary>
    /// Returns the longest key in the table that is a prefix of the query string.
    /// </summary>
    /// <param name="query">The query string.</param>
    /// <returns>The longest such key, or null when no key is a prefix of query.</returns>
    /// <exception cref="ArgumentNullException">query is null.</exception>
    public String LongestPrefixOf(String query)
    {
        if (query == null) throw new ArgumentNullException(nameof(query), "argument to longestPrefixOf() is null");
        int length = LongestPrefixOf(root, query, 0, -1);
        if (length == -1) return null;
        else return query.Substring(0, length);
    }

    /// <summary>
    /// Returns the length of the longest key in the subtrie rooted at x that is a prefix of query.
    /// </summary>
    /// <param name="x">Root of the subtrie being searched.</param>
    /// <param name="query">The query string.</param>
    /// <param name="d">Number of query characters already matched on the path to x.</param>
    /// <param name="length">Length of the longest matching key found so far, or -1 for none.</param>
    /// <returns>The length of the longest matching key, or -1 when there is none.</returns>
    private int LongestPrefixOf(Node x, String query, int d, int length)
    {
        if (x == null) return length;
        if (x.val != null) length = d;
        if (d == query.Length) return length;
        return LongestPrefixOf(Child(x, query[d]), query, d + 1, length);
    }

    /// <summary>
    /// Removes the key (and its value) from the table if it is present.
    /// </summary>
    /// <param name="key">The key to remove.</param>
    /// <exception cref="ArgumentNullException">key is null.</exception>
    public void Delete(String key)
    {
        if (key == null) throw new ArgumentNullException(nameof(key), "argument to delete() is null");
        root = Delete(root, key, 0);
    }

    // Removes key from the subtrie rooted at x and returns the new root of that subtrie,
    // which is null when the subtrie no longer holds any value or link.
    private Node Delete(Node x, String key, int d)
    {
        if (x == null) return null;
        if (d == key.Length)
        {
            if (x.val != null) n--;
            x.val = null;
        }
        else
        {
            char c = key[d];
            // a character outside the alphabet cannot be on any stored path: nothing to delete
            if (c < R) x.next[c] = Delete(x.next[c], key, d + 1);
        }

        // remove subtrie rooted at x if it is completely empty
        if (x.val != null) return x;
        for (int c = 0; c < R; c++)
            if (x.next[c] != null)
                return x;
        return null;
    }

}

测试

        /// <summary>
        /// Smoke test for TrieST: loads the whitespace-separated words of Data/shellsST.txt,
        /// inserts each word with its position as the value, then prints the longest key that
        /// is a prefix of "s" and every key starting with "s".
        /// </summary>
        [TestMethod]
        public void TrieSTFixture()
        {
            // Separate path segments keep the lookup independent of the host's directory separator.
            var data = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Data", "shellsST.txt");
            Console.WriteLine(data);

            List<string> words = new List<string>();
            using (StreamReader stream = new StreamReader(data))
            {
                while (!stream.EndOfStream)
                {
                    // RemoveEmptyEntries drops the empty tokens produced by blank lines
                    // and by runs of consecutive spaces.
                    words.AddRange(stream.ReadLine().Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
                }
            }

            TrieST<int> st = new TrieST<int>();
            for (int i = 0; i < words.Count; i++)
            {
                st.Put(words[i], i);
            }

            CodeTimer.Time("TrieST select:", 1, () =>
            {
                var value = st.LongestPrefixOf("s");
                Console.WriteLine(value ?? "null");
            });

            foreach (var value in st.KeysWithPrefix("s"))
            {
                Console.WriteLine(value);
            }
        }

字母表的大小为R, 在一棵由N 个随机键构造的单词查找树中,未命中查找平均所需检查的结点数量约为 $\log_R N$。

每个结点都有R 个链接,对应可能出现的每个字符,其中大部分是空链接,因此会浪费大量空间。

三向单词查找树

三向单词查找树(TST) 。在三向单词查找树中,每个结点都含有一个字符、三条链接和一个值。这三条链接分别对应着当前字母小于、等于和大于结点字母的所有键。

如果遇到了一个空链接或者当键结束时结点的值为空,那么查找未命中;如果键结束时结点的值非空则查找命中。在插入一个新键时,首先进行查找,然后和在单词查找树一样,在树中补全键末尾的所有结点。

image-20210505214210607

image-20210505214236381

    /// <summary>
    /// Symbol table with string keys, implemented as a ternary search trie (TST).
    /// Each node holds one character, three links (keys whose current character is smaller
    /// than, equal to, or larger than the node's character) and an optional value.
    /// Keys must have length >= 1.
    /// </summary>
    /// <typeparam name="Value">Type of the values stored in the table (reference or value type).</typeparam>
    public class TST<Value>
    {
        private int n;              // number of keys in the table
        private Node<Value> root;   // root of the TST

        /// <summary>
        /// TST node. hasValue records whether a key ends at this node; it is needed because
        /// val cannot be compared with null when the value type is a value type.
        /// </summary>
        public class Node<TVal>
        {
            public char c;                        // character
            public Node<TVal> left, mid, right;   // left, middle and right subtries
            public TVal val;                      // value associated with the string ending here
            public bool hasValue;                 // true when a key ends at this node
        }

        /// <summary>
        /// Initializes an empty string symbol table.
        /// </summary>
        public TST()
        {
        }

        /// <summary>
        /// Returns the number of key-value pairs in the table.
        /// </summary>
        /// <returns>The number of keys.</returns>
        public int Size()
        {
            return n;
        }

        /// <summary>
        /// Tells whether the table contains the given key.
        /// </summary>
        /// <param name="key">The key to look up.</param>
        /// <returns>true when a value is stored under key, false otherwise.</returns>
        /// <exception cref="ArgumentNullException">key is null.</exception>
        /// <exception cref="ArgumentException">key is empty.</exception>
        public bool Contains(string key)
        {
            if (key == null)
            {
                throw new ArgumentNullException(nameof(key), "argument to contains() is null");
            }
            if (key.Length == 0) throw new ArgumentException("key must have length >= 1");
            Node<Value> x = Get(root, key, 0);
            return x != null && x.hasValue;
        }

        /// <summary>
        /// Returns the value associated with the given key.
        /// </summary>
        /// <param name="key">The key to look up.</param>
        /// <returns>The stored value, or default(Value) when the key is not in the table.</returns>
        /// <exception cref="ArgumentNullException">key is null.</exception>
        /// <exception cref="ArgumentException">key is empty.</exception>
        public Value Get(string key)
        {
            if (key == null)
            {
                throw new ArgumentNullException(nameof(key), "calls get() with null argument");
            }
            if (key.Length == 0) throw new ArgumentException("key must have length >= 1");
            Node<Value> x = Get(root, key, 0);
            if (x == null || !x.hasValue) return default(Value);
            return x.val;
        }

        // Returns the node that corresponds to the last character of key in the subtrie
        // rooted at x (d is the index of the character being matched), or null if absent.
        private Node<Value> Get(Node<Value> x, String key, int d)
        {
            if (x == null) return null;
            if (key.Length == 0) throw new ArgumentException("key must have length >= 1");
            char c = key[d];
            if (c < x.c) return Get(x.left, key, d);
            else if (c > x.c) return Get(x.right, key, d);
            else if (d < key.Length - 1) return Get(x.mid, key, d + 1);
            else return x;
        }

        /// <summary>
        /// Inserts the key-value pair, overwriting the old value if the key is already present.
        /// A null value removes the key; it is a no-op when the key is not present.
        /// </summary>
        /// <param name="key">The key to insert.</param>
        /// <param name="val">The value to associate with the key.</param>
        /// <exception cref="ArgumentNullException">key is null.</exception>
        /// <exception cref="ArgumentException">key is empty.</exception>
        public void Put(String key, Value val)
        {
            if (key == null)
            {
                throw new ArgumentNullException(nameof(key), "calls put() with null key");
            }
            if (key.Length == 0) throw new ArgumentException("key must have length >= 1");
            if (val == null)
            {
                // null means "delete": clear the value if the key exists, never create nodes
                Node<Value> x = Get(root, key, 0);
                if (x != null && x.hasValue)
                {
                    x.hasValue = false;
                    x.val = default(Value);
                    n--;
                }
                return;
            }
            root = Put(root, key, val, 0);
        }

        // Stores val under key in the subtrie rooted at x, creating missing nodes on the way,
        // and returns the (possibly new) root of that subtrie.
        private Node<Value> Put(Node<Value> x, String key, Value val, int d)
        {
            char c = key[d];
            if (x == null)
            {
                x = new Node<Value>();
                x.c = c;
            }
            if (c < x.c) x.left = Put(x.left, key, val, d);
            else if (c > x.c) x.right = Put(x.right, key, val, d);
            else if (d < key.Length - 1) x.mid = Put(x.mid, key, val, d + 1);
            else
            {
                if (!x.hasValue) n++;   // first value at this node: a new key
                x.hasValue = true;
                x.val = val;
            }
            return x;
        }

        /// <summary>
        /// Returns the longest key in the table that is a prefix of the query string.
        /// </summary>
        /// <param name="query">The query string.</param>
        /// <returns>
        /// The longest such key; the empty string when no key is a prefix of query;
        /// null when query itself is empty.
        /// </returns>
        /// <exception cref="ArgumentNullException">query is null.</exception>
        public String LongestPrefixOf(String query)
        {
            if (query == null)
            {
                throw new ArgumentNullException(nameof(query), "calls longestPrefixOf() with null argument");
            }
            if (query.Length == 0) return null;
            int length = 0;
            Node<Value> x = root;
            int i = 0;
            while (x != null && i < query.Length)
            {
                char c = query[i];
                if (c < x.c) x = x.left;
                else if (c > x.c) x = x.right;
                else
                {
                    i++;
                    if (x.hasValue) length = i;   // a key ends here: remember its length
                    x = x.mid;
                }
            }
            return query.Substring(0, length);
        }

        /// <summary>
        /// Returns all keys in the table, in ascending character order.
        /// </summary>
        /// <returns>The keys.</returns>
        public IEnumerable<string> Keys()
        {
            Queue<string> queue = new Queue<string>();
            Collect(root, new StringBuilder(), queue);
            return queue;
        }

        /// <summary>
        /// Returns all keys that start with the given prefix.
        /// </summary>
        /// <param name="prefix">The prefix; the empty string matches every key.</param>
        /// <returns>The matching keys, in ascending character order.</returns>
        /// <exception cref="ArgumentNullException">prefix is null.</exception>
        public IEnumerable<String> KeysWithPrefix(String prefix)
        {
            if (prefix == null)
            {
                throw new ArgumentNullException(nameof(prefix), "calls keysWithPrefix() with null argument");
            }
            // Every key starts with the empty prefix; the node lookup below needs length >= 1.
            if (prefix.Length == 0) return Keys();
            Queue<String> queue = new Queue<String>();
            Node<Value> x = Get(root, prefix, 0);
            if (x == null) return queue;
            if (x.hasValue) queue.Enqueue(prefix);
            Collect(x.mid, new StringBuilder(prefix), queue);
            return queue;
        }

        /// <summary>
        /// Appends to queue every key in the subtrie rooted at x, each prepended with prefix.
        /// </summary>
        /// <param name="x">Root of the subtrie to traverse.</param>
        /// <param name="prefix">Characters on the path to x; restored before returning.</param>
        /// <param name="queue">Receives the keys found.</param>
        private void Collect(Node<Value> x, StringBuilder prefix, Queue<String> queue)
        {
            if (x == null) return;
            Collect(x.left, prefix, queue);
            if (x.hasValue) queue.Enqueue(prefix.ToString() + x.c);
            Collect(x.mid, prefix.Append(x.c), queue);
            prefix.Remove(prefix.Length - 1, 1);
            Collect(x.right, prefix, queue);
        }


        /// <summary>
        /// Returns all keys that match the pattern, where '.' matches any single character.
        /// </summary>
        /// <param name="pattern">The pattern; matching keys have exactly the pattern's length.</param>
        /// <returns>The matching keys; empty when pattern is empty.</returns>
        /// <exception cref="ArgumentNullException">pattern is null.</exception>
        public IEnumerable<String> KeysThatMatch(String pattern)
        {
            if (pattern == null)
            {
                throw new ArgumentNullException(nameof(pattern), "calls keysThatMatch() with null argument");
            }
            Queue<String> queue = new Queue<String>();
            // keys have length >= 1, so an empty pattern cannot match anything
            if (pattern.Length == 0) return queue;
            Collect(root, new StringBuilder(), 0, pattern, queue);
            return queue;
        }

        // Pattern-matching variant of Collect: i is the index of the pattern character
        // being matched against the nodes of the subtrie rooted at x.
        private void Collect(Node<Value> x, StringBuilder prefix, int i, String pattern, Queue<String> queue)
        {
            if (x == null) return;
            char c = pattern[i];
            if (c == '.' || c < x.c) Collect(x.left, prefix, i, pattern, queue);
            if (c == '.' || c == x.c)
            {
                if (i == pattern.Length - 1 && x.hasValue) queue.Enqueue(prefix.ToString() + x.c);
                if (i < pattern.Length - 1)
                {
                    Collect(x.mid, prefix.Append(x.c), i + 1, pattern, queue);
                    prefix.Remove(prefix.Length - 1, 1);
                }
            }
            if (c == '.' || c > x.c) Collect(x.right, prefix, i, pattern, queue);
        }
    }

测试

        /// <summary>
        /// Smoke test for TST: loads the whitespace-separated words of Data/shellsST.txt,
        /// inserts each word with its position as the value, then prints the longest key that
        /// is a prefix of "s" and every key starting with "s".
        /// </summary>
        [TestMethod]
        public void TSTFixture()
        {
            // Separate path segments keep the lookup independent of the host's directory separator.
            var data = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Data", "shellsST.txt");
            Console.WriteLine(data);

            List<string> words = new List<string>();
            using (StreamReader stream = new StreamReader(data))
            {
                while (!stream.EndOfStream)
                {
                    // RemoveEmptyEntries drops the empty tokens produced by blank lines and by
                    // runs of consecutive spaces; TST rejects empty keys.
                    words.AddRange(stream.ReadLine().Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
                }
            }

            TST<int> st = new TST<int>();
            for (int i = 0; i < words.Count; i++)
            {
                st.Put(words[i], i);
            }

            CodeTimer.Time("TST select:", 1, () =>
            {
                var value = st.LongestPrefixOf("s");
                Console.WriteLine(value ?? "null");
            });

            foreach (var value in st.KeysWithPrefix("s"))
            {
                Console.WriteLine(value);
            }

            // Output recorded by the author from one run:
            //TST select:s
            //	Time Elapsed:		2ms
            //	Time Elapsed (one time):2ms
            //	CPU time:		0ns
            //	CPU time (one time):	0ns
            //	Gen 0: 			0
            //	Gen 1: 			0
            //	Gen 2: 			0
        }

由N 个平均长度为w 的字符串构造的三向单词查找树中的链接总数在3N 到3Nw 之间。

字符串符号表

image-20210505221414878

posted @ 2021-05-05 22:19  阿杜888  阅读(168)  评论(0编辑  收藏  举报