A Simple C++ Template Class that Matches a String to a Wildcard Pattern

A recently implemented, enhanced wildcard string matcher whose features include:

  • Supporting wildcard character '*' for matching zero or more characters
  • Supporting wildcard character '?' for matching exactly one character
  • Supporting parentheses '(' and ')' for referencing the matches
  • Supporting escape character (back-slash)

C++ features demonstrated by this implementation:

  • Functors, designed so that function pointers or user-instantiated functors carrying user data can be supplied instead
  • Specialized templates
  • Template rebinding

The implementation is maintained as part of quanben's ongoing C++ template library project, qcpplib, which is publicly available on GitHub at https://github.com/lincolnyu/qcpplib/

The current snapshot of the code is as follows:

  1 //
  2 //  qcpplib v1.00
  3 //  quanben's C++ template library
  4 //  
  5 //  Author Lincoln Yu
  6 //
  7 //  lincoln.yu@gmail.com
  8 //  https://github.com/lincolnyu/qcpplib
  9 //
 10 //  The MIT License (MIT)
 11 // 
 12 //  Copyright (c) <year> <copyright holders>
 13 // 
 14 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 15 //  of this software and associated documentation files (the "Software"), to deal
 16 //  in the Software without restriction, including without limitation the rights
 17 //  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 18 //  copies of the Software, and to permit persons to whom the Software is
 19 //  furnished to do so, subject to the following conditions:
 20 // 
 21 //  The above copyright notice and this permission notice shall be included in
 22 //  all copies or substantial portions of the Software.
 23 // 
 24 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 25 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 26 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 27 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 28 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 29 //  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 30 //  THE SOFTWARE.
 31 //
 32 
 33 #if !defined (_WILDCARD_H_)
 34 #define _WILDCARD_H_
 35 
 36 #include <map>
 37 #include <vector>
 38 #include <cstring>
 39 #include <string>
 40 
 41 /// @brief Contains class definitions that deal with wildcard matching
 42 namespace Qtl { namespace String { namespace Wildcard {
 43 
 44     /// @brief An implementation of the functor that returns the length of string whose iterators are applicable 
 45     ///        to subtract operator
 46     template <class TStringRef, class TSubtractableIter>
 47     struct CharDistFunctorIndexed
 48     {
 49         size_t operator()(TStringRef str, TSubtractableIter iterBegin, TSubtractableIter iterEnd)
 50         {
 51             return (iterEnd-iterBegin);
 52         }
 53     };
 54     
 55     /// @brief An implementation of the functor that returns the position of the first character for character-based 
 56     ///        zero-terminated string
 57     struct StringBeginFunctorPsz
 58     {
 59         char * operator()(char *str)
 60         {
 61             return (str);
 62         }
 63     };
 64 
 65     /// @brief An implementation of the functor that returns the position of the first character for a std::string
 66     struct StringBeginFunctorStdStr
 67     {
 68         std::string::const_iterator operator()(const std::string& str)
 69         {
 70             return str.begin();
 71         }
 72     };
 73     
 74     /// @brief An implementation of the functor that determines if the position is at the end of a character-based
 75     ///        zero-terminated string
 76     struct StringEndFunctorPsz
 77     {
 78         bool operator()(char *iter, char *str)
 79         {
 80             return (*iter == 0);
 81         }
 82     };
 83 
 84     /// @brief An implementation of the functor that determines if the position is at the end of a std::string
 85     struct StringEndFunctorStdStr
 86     {
 87         bool operator()(std::string::const_iterator iter, const std::string& str)
 88         {
 89             return (iter==str.end());
 90         }
 91     };
 92 
 93     /// @brief An implementation of the functor that appends a character to a character-based zero-terminated string
 94     struct AppendCharFunctorPsz
 95     {
 96         void operator()(char *str, char *&iter, char ch)
 97         {
 98             *iter++ = ch;
 99         }
100     };
101 
102     /// @brief An implementation of the functor that appends a character to a std::string
103     struct AppendCharFunctorStdStr
104     {
105         void operator()(std::string &str, std::string::iterator &iter, char ch)
106         {
107             str.push_back(ch);
108         }
109     };
110 
111     /// @brief The default class that provides string functors
112     struct DefaultStringFunctorSelector
113     {
114         /// @brief The generic rebinder
115         template <class TStringRef, class TCharIter>
116         struct rebind
117         {
118             // unimplemented, compiler error occurs if getting here
119         };
120         
121         /// @brief The rebinder to the character array based string functors
122         template <>
123         struct rebind<char*,char*>
124         {
125             typedef StringBeginFunctorPsz                StringBeginFunctor;
126             typedef StringEndFunctorPsz                StringEndFunctor;
127             typedef CharDistFunctorIndexed<char*,char*>    CharDistFunctor;
128         };
129 
130         /// @brief The rebinder to the std::string based string functors
131         template <>
132         struct rebind<const std::string&, std::string::const_iterator>
133         {
134             typedef StringBeginFunctorStdStr            StringBeginFunctor;
135             typedef StringEndFunctorStdStr                StringEndFunctor;
136             typedef CharDistFunctorIndexed<std::string::const_iterator, std::string::const_iterator>    CharDistFunctor;
137         };
138     };
139 
140     /// @brief The default class that provides functor that appends character to string
141     struct DefaultAppendCharFunctorSelector
142     {
143         /// @brief The generic binder
144         template <class TStringRef, class TCharIter, class TChar>
145         struct rebind
146         {
147             // unimplemented, compiler error occurs if getting here
148         };
149 
150         /// @brief The rebinder to the functor that appends character to character-based zero-terminating string
151         template <>
152         struct rebind<char*, char*, char>
153         {
154             typedef AppendCharFunctorPsz    AppendCharFunctor;
155         };
156 
157         /// @brief The rebinder to the functor that appends character to std::string
158         template <>
159         struct rebind<std::string&, std::string::iterator, char>
160         {
161             typedef AppendCharFunctorStdStr    AppendCharFunctor;
162         };
163     };
164     
165     /// @brief A class that encapsulates a wildcard pattern
166     /// @param TString The type of the pattern string
167     /// @param TStringRef The type of the reference to the pattern string (for efficient parameter passing)
168     /// @param TCharIter The type of the iterator through the characters
169     /// @param TStringBeginFunctor The type of the functor that returns the iterator at the beginning of a string
170     /// @param TStringEndFunctor The type of the functor that determines if the iterator is at the end of a string
171     template <class TString=char*, class TStringRef=char*, class TCharIter=char*,
172         class TStringFunctorSelector=DefaultStringFunctorSelector>
173     class Pattern
174     {
175     public:
176         typedef TStringRef StringRef;
177 
178         /// @brief The type of iterator through the characters in the pattern string
179         typedef TCharIter CharIter;
180         
181         /// @brief The type of the functor that returns the iterator at the beginning of a string
182         typedef typename TStringFunctorSelector::template rebind<TStringRef, TCharIter>::StringBeginFunctor    StringBeginFunctor;
183         
184         /// @brief The type of the functor that determines if the iterator is at the end of a string
185         typedef typename TStringFunctorSelector::template rebind<TStringRef, TCharIter>::StringEndFunctor    StringEndFunctor;
186 
187         /// @brief The type of the functor that returns the distance between two characters 
188         typedef typename TStringFunctorSelector::template rebind<TStringRef, TCharIter>::CharDistFunctor CharDistFunctor;
189 
190     private:
191         /// @brief The pattern string
192         TString                 _pattern;
193         
194         /// @brief The functor that returns the beginning of the string
195         StringBeginFunctor         _getStringBegin;
196         
197         /// @brief The functor that returns if the iterator is at the end of the string
198         StringEndFunctor         _isStringEnd;
199 
200         /// @brief The functor that returns the distance between two characters
201         CharDistFunctor            _getCharDist;
202         
203         /// @brief The look-up table that maps iterator of pattern to the index of match result entry
204         std::map<CharIter, int> _mapIterToIndex;
205 
206     public:
207         // a typical wildcard pattern: 
208         //   a*b?C(*)
209         // 
210         /// @brief Instantiates a pattern with the pattern string and the functors
211         /// @param pattern The pattern string
212         /// @param stringBegin The functor that provides the beginning of the string
213         /// @param stringEnd The functor that determines the end of the string
214         /// @remarks A typical wildcard pattern is like: a*b?C(*)D\)
215         ///          where normal characters (alphanumerics, punctuation etc) expect exact match, asteroids match whatever 
216         ///          string of whatever length, question marks match any single character and an escape character 
217         ///          (back-slash) turns a succeeding special character to a normal matching character.
218         Pattern(TStringRef pattern, StringBeginFunctor stringBegin, StringBeginFunctor stringEnd) 
219             : _pattern(pattern), _getStringBegin(stringBegin), _isStringEnd(stringEnd)
220         {
221             PreProcessParentheses();
222         }
223 
224         /// @brief Instantiates a pattern with the pattern string
225         /// @param pattern The pattern string
226         Pattern(TStringRef pattern) : _pattern(pattern)
227         {
228             PreProcessParentheses();
229         }
230 
231     private:
232         /// @brief Creates the mapping from parenthesis pointer to index from the pattern string
233         void PreProcessParentheses()
234         {
235             _mapIterToIndex.clear();
236             int openingIndex = 0;
237             int closingIndex = 0;
238             for (CharIter iter = GetBegin(); !IsEnd(iter); ++iter)
239             {
240                 if (*iter=='\\')
241                 {
242                     ++iter;    // skip the character that follows
243                 }
244                 else if (*iter == '(')
245                 {
246                     _mapIterToIndex[iter] = closingIndex = openingIndex++;
247                 }
248                 else if (*iter == ')')
249                 {
250                     // NOTE We don't need to differentiate opening and closing parentheses as
251                     //      the matcher has the knowledge of the pattern characters
252                     _mapIterToIndex[iter] = closingIndex--;
253                 }
254             }
255         }
256 
257     public:
258         /// @brief Returns the beginning of the pattern string
259         /// @return The iterator point to the beginning of the pattern string
260         CharIter GetBegin()
261         {
262             return _getStringBegin(_pattern);
263         }
264 
265         /// @brief Determines if the iterator is at the end of the pattern string
266         /// @param The interator in question
267         /// @return true if the iterator is at the beginning of the pattern string
268         bool IsEnd(CharIter iter)
269         {
270             return _isStringEnd(iter, _pattern);
271         }
272 
273         /// @brief Returns the match entry index for the specified parenthesis pointer
274         /// @return The match entry index
275         int PatternIterToIndex(CharIter patternIter)
276         {
277             return _mapIterToIndex[patternIter];
278         }
279 
280         /// @brief Returns the distance between two characters (the number of characters in between plus one)
281         /// @param iterBegin The iterator that points to the character on the left hand
282         /// @param iterEnd The iterator that points to the character on the right hand
283         /// @return The distance
284         size_t GetQuotedLength(CharIter iterBegin, CharIter iterEnd)
285         {
286             return _getCharDist(_pattern, iterBegin, iterEnd);
287         }
288     };
289 
290     /// @brief A class that converts a wildcard pattern to its equivalent regular expression
291     /// @param TPattern The type of the pattern class 
292     /// @param TRegexStringRef The type of the reference to the string for regular expression
293     /// @param TRegexCharIter The iterator through characters in the string for regular expression
294     /// @param TPatternFunctorSelector The functor selector for pattern
295     /// @param TRegexAppendCharFunctorSelector The append-character functor selector for regular expression
296     /// @remarks NOTE TRegexChar has to be compatible with the character type TPattern::CharIter iterates through
297     template <class TPattern=Pattern<>, class TRegexStringRef=char*, class TRegexCharIter=char*, 
298         class TRegexChar=char, class TRegexAppendCharFunctorSelector=DefaultAppendCharFunctorSelector>
299     class WildCardToRegex
300     {
301     public:
302         /// @brief The type of the reference to regular expression string
303         typedef TRegexStringRef        RegexStringRef;
304         /// @brief The type of the iteartor through the characters in the regular expression string
305         typedef TRegexCharIter        RegexCharIter;
306         /// @brief The type of the character that can be append to the regular expression string
307         typedef TRegexChar            RegexChar;
308 
309         /// @brief The type of the reference to the wildcard string
310         typedef typename TPattern::StringRef    PatternStringRef;
311         /// @brief The type of the iterator through the characters in the wildcard string
312         typedef typename TPattern::CharIter        PatternStringIter;
313 
314         /// @brief
315         typedef typename TRegexAppendCharFunctorSelector::template rebind<RegexStringRef, RegexCharIter, RegexChar>::AppendCharFunctor
316             RegexAppendCharFunctor;
317 
318     private:
319         /// @brief The functor that appends character to the regular expression string
320         RegexAppendCharFunctor _regexAppendChar;
321 
322     public:
323         /// @brief Initialises a WildCardToRegex with the specified functor instances
324         /// @param regexAppendChar The functor that appends character to the regular expression string
325         WildCardToRegex(RegexAppendCharFunctor &regexAppendChar)
326             : _regexAppendChar(regexAppendChar)
327         {
328         }
329 
330         /// @brief Initialises a WildCardToRegex with the default settings
331         WildCardToRegex()
332         {
333         }
334 
335     public:
336         /// @brief Converts a wildcard string to its equivalent regular expression
337         /// @remarks This is supposed to comply with the rules set by the regex implementation in QSharp
338         ///          See https://qsharp.codeplex.com/SourceControl/latest#QSharp/QSharp.String.Rex/Creator.cs
339         ///          for more detail. It has yet to be tested though.
340         void Convert(TPattern &pattern, TRegexStringRef regex, TRegexCharIter iterRegex)
341         {
342             for (PatternStringIter iter = pattern.GetBegin(); !pattern.IsEnd(iter); ++iter)
343             {
344                 switch (*iter)
345                 {
346                 case '\\':
347                     _regexAppendChar(regex, iterRegex, *iter);
348                     ++iter;
349                     if (!pattern.IsEnd(iter))
350                     {
351                         _regexAppendChar(regex, iterRegex, *iter);
352                     }
353                     else
354                     {
355                         _regexAppendChar(regex, iterRegex, '\\');
356                     }
357                     break;
358                 case '*':
359                     _regexAppendChar(regex, iterRegex, '.');
360                     _regexAppendChar(regex, iterRegex, *iter);
361                     break;
362                 case '?':
363                     _regexAppendChar(regex, iterRegex, '.');
364                     break;
365                 case '(': case ')':
366                     _regexAppendChar(regex, iterRegex, *iter);
367                     break;
368                 case '[': case ']': case '{': case '}': case '^': case '.': case '-': case '+':
369                     _regexAppendChar(regex, iterRegex, '\\');
370                     _regexAppendChar(regex, iterRegex, *iter);
371                     break;
372                 default:
373                     _regexAppendChar(regex, iterRegex, *iter);
374                     break;
375                 }
376             }
377         }
378     };
379 
380     /// @brief A class that represents a match of quotation enclosed by a pair of parentheses in the pattern
381     /// @param TCharIter The type of iterator through the source string
382     /// @param TDiff The type of a integer number that indicates the length of string or the distance between characters
383     template <class TCharIter=char*, class TDiff=size_t>
384     class MatchQuote
385     {
386     public:
387         /// @brief The type of iterator through the source string
388         typedef TCharIter    CharIter;
389 
390         /// @brief The type of a integer number that indicates the length of string or the distance between characters
391         typedef TDiff        Diff;
392 
393     public:
394         /// @brief The beginning of the substring that matches
395         CharIter Begin;
396 
397         /// @brief The end of the substring that matches
398         CharIter End;
399     };    
400     
401     /// @brief A class that contains all the matched quotations
402     /// @param TCharIter The iterator through the source string
403     /// @param TDiff The type of the integer that indicates a string length or a character distance
404     template <class TCharIter=char*, class TDiff=size_t>
405     class MatchResult
406     {
407     public:
408         /// @brief The iterator through the source string
409         typedef TCharIter        CharIter;
410         /// @brief The type of the integer that indicates a string length or a character distance
411         typedef TDiff            Diff;
412         /// @brief The type of match entries listed in this object
413         typedef MatchQuote<CharIter, Diff>  MatchType;
414 
415     public:
416         /// @brief A list of matched quotation entries
417         std::vector<MatchType> Matches;
418 
419     public:
420         /// @brief Records the beginning of a quotation encountered
421         /// @param index The index of the match entry
422         /// @param iterChar The pointer to the source string where the quotation starts
423         void Open(int index, CharIter iterChar)
424         {
425             while (index >= Matches.size())
426             {
427                 Matches.push_back(MatchType());
428             }
429             Matches[index].Begin = iterChar;
430         }
431         
432         /// @brief Records the end of a quotation encountered
433         /// @param index The index of the match entry
434         /// @param iterChar The pointer to the source string where the quotation ends
435         void Close(int index, CharIter iterChar)
436         {
437             // cell index must have already been allocated in the array of Matches
438             Matches[index].End = iterChar;
439         }
440     };
441 
442     /// @brief A default trait class that provides types needed by Matcher
443     /// @param TChar 
444     template <class TStringRef=char*, class TCharIter=char*, class TDiff=size_t,
445         class TStringFunctorSelector=DefaultStringFunctorSelector>
446     struct MatcherTraits
447     {
448         /// @brief The type of the reference to the source string 
449         typedef TStringRef     StringRef;
450         /// @brief The type of iterator through the characters in the source string
451         typedef TCharIter      CharIter;
452         
453         /// @brief The type of the match result (matched quotation entry container)
454         typedef MatchResult<TCharIter, TDiff>    MatchResultType;
455         /// @brief The type of the reference to the match result
456         typedef MatchResultType &                MatchResultRef;
457         
458         /// @brief The type of the functor that returns the beginning of a string
459         typedef typename TStringFunctorSelector::template rebind<StringRef, CharIter>::StringBeginFunctor    StringBeginFunctor;
460         /// @brief The type of the functor that determines if an iterator is at the end of a string
461         typedef typename TStringFunctorSelector::template rebind<StringRef, CharIter>::StringEndFunctor        StringEndFunctor;
462     };
463     
464     /// @brief A wildcard string matcher
465     template <class Traits = MatcherTraits<>>
466     class Matcher
467     {
468     public:
469         /// @brief The type of the reference to the source string 
470         typedef typename Traits::StringRef            StringRef;
471         /// @brief The type of iterator through the characters in the source string
472         typedef typename Traits::CharIter             CharIter;
473         
474         /// @brief The type of the reference to the match result
475         typedef typename Traits::MatchResultRef        MatchResultRef;
476         
477         /// @brief The type of the functor that returns the beginning of a string
478         typedef typename Traits::StringBeginFunctor    StringBeginFunctor;
479         /// @brief The type of the functor that determines if an iterator is at the end of a string
480         typedef typename Traits::StringEndFunctor    StringEndFunctor;
481 
482     private:
483         /// @brief The functor that returns the beginning of a string
484         StringBeginFunctor     _stringBegin;
485 
486         /// @brief The functor that determines if an iterator is at the end of a string
487         StringEndFunctor    _stringEnd;
488                 
489     public:
490         /// @brief Instantiates a Matcher with the specified string functor instances
491         /// @param stringBegin The functor that returns the beginning of a string
492         /// @param stringEnd The functor that determines if an iterator is at the end of a string
493         Matcher(StringBeginFunctor &stringBegin, StringEndFunctor &stringEnd)
494             : _stringBegin(stringBegin), _stringEnd(stringEnd)
495         {
496         }
497         
498         /// @brief Instantiates a Matcher with default settings
499         Matcher()
500         {
501         }
502         
503     public:
504         /// @brief Match The source to the pattern
505         /// @param source The source string to match
506         /// @param pattern The pattern to match against
507         /// @param matchResult The container of matched quotation entries
508         /// @return true if the matching is successful (the pattern is completely consumed)
509         template <class TPattern>
510         bool Match(StringRef source, TPattern &pattern, MatchResultRef matchResult)
511         {
512             CharIter iterSource = _stringBegin(source);
513             TPattern::CharIter iterPattern = pattern.GetBegin();
514             return Match(source, iterSource, pattern, iterPattern, matchResult);
515         }
516         
517         /// @brief Match the source to the pattern (recursive)
518         /// @param source The source string to match
519         /// @param iterSource The iterator through the source string at its current position
520         /// @param pattern The pattern to match against
521         /// @param iterPattern The iterator through the pattern string at its current position
522         /// @param matchResult The container of matched quotation entries
523         /// @return true if the matching is successful (the pattern is completely consumed)
524         template <class TPattern>
525         bool Match(StringRef source, CharIter &iterSource, TPattern &pattern, typename TPattern::CharIter &iterPattern, 
526             MatchResultRef matchResult)
527         {
528             while (! pattern.IsEnd(iterPattern))
529             {
530                 if (*iterPattern == '\\')
531                 {
532                     ++iterPattern;
533                 }
534                 else if (*iterPattern == '*')
535                 {
536                     CharIter savedIterSource = iterSource;
537                     TPattern::CharIter savedIterPattern = iterPattern;
538                     // greedy strategy
539                     if (!_stringEnd(savedIterSource, source))
540                     {
541                         ++iterSource;
542                         if (Match(source, iterSource, pattern, savedIterPattern, matchResult))
543                         {
544                             return true;
545                         }
546                     }
547                     ++iterPattern;
548                     if (Match(source, savedIterSource, pattern, iterPattern, matchResult))
549                     {
550                         return true;
551                     }
552                     return false;
553                 }
554                 else if (*iterPattern == '?')
555                 {
556                     if (_stringEnd(iterSource, source))
557                     {
558                         return false;
559                     }
560                     ++iterPattern;
561                     ++iterSource;
562                     continue;
563                 }
564                 else if (*iterPattern == '(')
565                 {
566                     int index = pattern.PatternIterToIndex(iterPattern);
567                     matchResult.Open(index, iterSource);
568                     ++iterPattern;
569                     continue;
570                 }
571                 else if (*iterPattern == ')')
572                 {
573                     int index = pattern.PatternIterToIndex(iterPattern);
574                     matchResult.Close(index, iterSource);
575                     ++iterPattern;
576                     continue;
577                 }
578                 
579                 if (!_stringEnd(iterSource, source) && *iterPattern == *iterSource)
580                 {
581                     ++iterPattern;
582                     ++iterSource;
583                 }
584                 else
585                 {
586                     return false;
587                 }
588             }
589             return true;
590         }
591     };
592 }}}
593 
594 #endif

 

 

posted @ 2013-09-21 21:22  quanben  阅读(922)  评论(0编辑  收藏  举报