c++ 正则表达式

https://www.runoob.com/regexp 入门必看

对于查找模式为：string e=R"((http://)?(www\.baidu\.com))"s;（这里用原始字符串；如果用普通字符串，\. 必须写成 \\.，否则 \. 不是合法的转义序列。）

string input=R"(this is prefix http://www.baidu.com this is suffix)";

对应字符串做

smatch sm;（smatch 是 string match，用于 std::string；cmatch 是 char match，用于 C 风格字符串 const char*。）

regex_search(input, sm, regex{e})

匹配结果会放到sm中。

this is prefix 部分会放到sm.prefix()中，sm.prefix().matched是true。sm.prefix().str()取出串。

（）是提取group。这里的模式匹配里有两个，所以会有两个提取出，分别是sm[1],sm[2]。如果匹配到，相应的matched会是true。sm[n]就是第n个被匹配出的。n<11个。

子串匹配后剩余部分是sm.suffix()。

regex_match：全文匹配，要求整个字符串符合正则表达式的匹配规则。用来判断一个字符串和一个正则表达式是否模式匹配，如果匹配成功则返回true，否则返回false。

regex_search：搜索匹配，根据正则表达式来搜索字符串中是否存在符合规则的子字符串。

regex_replace：替换匹配，即可以将符合匹配规则的子字符串替换为其他字符串。要求输入一个正则表达式，以及一个用于替换匹配子字符串的格式化字符串。这个格式化字符串可以通过转义序列引用匹配子字符串中的部分内容。

std::regex_search和std::regex_match的区别：

// Contrast between searching for a sub-sequence and matching the whole input.
std::regex re("Get|GetValue");
std::cmatch m;
std::regex_search("GetValue", m, re);  // returns true, and m[0] contains "Get" (search stops at the first alternative that matches)
std::regex_match ("GetValue", m, re);  // returns true, and m[0] contains "GetValue" (the whole input has to be covered)
std::regex_search("GetValues", m, re); // returns true, and m[0] contains "Get" (finding a matching substring is enough)
std::regex_match ("GetValues", m, re); // returns false (the whole input does not match)

(?:...) 是非捕获分组：只把括号里的内容当作一个整体（一般与 | 或连用），匹配到的内容不会保存到 sm[n] 中。
比如：
(?:https?|ftp|file)@user
https 匹配后如果和后面的 @user 不匹配，引擎会回溯到分组开头，改用其他分支（如 ftp、file）重新尝试，而不是从 https 后面接着匹配。

1，

(.*)\".*\\breg.*\\bex.*\"\\s*$

括号部分是获取子串，会保存到sm[1]

后面的子串如何获取？？？他在sm[1]的second里面。是个iterator，如何取出？用suffix取出。

#include <iostream>
#include <regex>
#include <string>
 
using namespace std;
 
// Demonstrates regex_match with a capture group, followed by regex_search on
// the captured text.
int main() {
    const auto input = "Some people, when confronted with a problem, think \"I know, I'll use regular expressions.\""s;
    smatch sm;

    cout << input << endl;

    // If input ends in a quotation that contains a word that begins with "reg" and another word beginning with "ex" then capture the preceding portion of input
    if (regex_match(input, sm, regex("(.*)\".*\\breg.*\\bex.*\"\\s*$"))) {
        const auto capture = sm[1].str();

        cout << '\t' << capture << endl; // Outputs: "\tSome people, when confronted with a problem, think \n"

        // Search our capture for "a problem" or "# problems".
        // The count alternative must be \\d+ (digits); a bare d+ would only
        // match the letter 'd' and a numeric count would never be found.
        if (regex_search(capture, sm, regex("(a|\\d+)\\s+problems?"))) {
            const auto count = sm[1] == "a"s ? 1 : stoi(sm[1]);

            cout << '\t' << count << (count > 1 ? " problems\n" : " problem\n"); // Outputs: "\t1 problem\n"
            cout << "Now they have " << count + 1 << " problems.\n"; // Outputs: "Now they have 2 problems.\n"
        }
    }
}

2，对剩下的子串不停查找

要查找的“子串”，写在regex里面。子串前面的是前缀，后面的是后缀。

#include <iostream>
#include <regex>
#include <string>
using namespace std;
 
int main() {
    auto input = "+1--12*123/+1234"s;
    smatch sm;
 
    if(regex_search(input, sm, regex{ "(?:^|\\b\\W)([+-]?\\d+)" })) {
 
        do {
            cout << sm[1] << endl;
            input = sm.suffix().str();
        } while(regex_search(input, sm, regex{ "(?:^\\W|\\b\\W)([+-]?\\d+)" }));
    }
}

输出
+1
-12
123
+1234

3, (<a.*?</a>) 如果没有问号，是贪婪greedy最大化匹配。如 <a href=...></a><a href=..>xx</a>.整个子串将被匹配。

但是加上？后（非贪婪 lazy），只会匹配到第一个 <a href=...></a>。

4，token用来切开split文本，切去的不要。切好后用iterator接收。

对于：
start < a HREF ="http://cppreference.com">cppreference</a>\n</p>

其中：

start < a HREF ="http://cppreference.com">cppreference</a>\n</p> 提取部分：分组 1 捕获的 http://cppreference.com，最后参数是 1 时取值

start < a HREF ="http://cppreference.com">cppreference</a>\n</p> 忽略部分：被匹配但不在分组里的 < a HREF =" 和结尾的引号

start < a HREF ="http://cppreference.com">cppreference</a>\n</p>  剩余部分：匹配之外的 start 和 >cppreference</a>\n</p>，最后参数是 -1 时取值

#include <fstream>
#include <iostream>
#include <algorithm>
#include <iterator>
#include <regex>
 
// Shows std::sregex_token_iterator in its two roles: splitting text on a
// delimiter (-1) and walking the pieces around/inside each match.
int main()
{
    const std::sregex_token_iterator no_more_tokens;

    // Tokenization (non-matched fragments).
    // The whitespace regex matches only twice; the third token is the suffix
    // that follows the last match.
    const std::string text = "Quick brown fox.";
    const std::regex ws_re("\\s+"); // whitespace
    for (std::sregex_token_iterator word(text.begin(), text.end(), ws_re, -1);
         word != no_more_tokens; ++word) {
        std::cout << word->str() << "\n";
    }

    std::cout << '\n';

    // Walking an HTML snippet with a regex whose first group is the href value.
    const std::string html = R"(<p><a href="http://google.com">google</a> )"
                             R"(start < a HREF ="http://cppreference.com">cppreference</a>\n</p>)";
    const std::regex url_re(R"!!(<\s*A\s+[^>]*href\s*=\s*"([^"]*)")!!", std::regex::icase);
    // Last argument: 1 yields capture group 1 of every match; -1 yields the
    // text that is NOT matched (before, between and after the matches).
    for (std::sregex_token_iterator piece(html.begin(), html.end(), url_re, -1);
         piece != no_more_tokens; ++piece) {
        std::cout << piece->str() << "\n";
    }
}

输出：

Quick

brown

fox.

<p>

>google</a> start 

>cppreference</a>\n</p>

改成1时输出：

Quick
brown
fox.

http://google.com
http://cppreference.com

split:

// Splits `str` at every match of the regular expression `regex` and returns
// the pieces between the matches (the matched delimiters are dropped).
// A delimiter at the very start produces an empty first piece; an empty
// trailing piece is not produced.
std::vector<std::string> split(const std::string &str, std::string regex)
{
    const std::regex delimiter(regex);
    std::vector<std::string> pieces;
    const std::sregex_token_iterator last;
    // Sub-match index -1 selects the text that lies outside the matches.
    for (std::sregex_token_iterator it(str.begin(), str.end(), delimiter, -1); it != last; ++it) {
        pieces.push_back(it->str());
    }
    return pieces;
}
split("Some string\t with whitespace ", "\\s+"); // "Some", "string", "with", "whitespace"

5,查找用

color_match.size 和

color_match[i]
取查找到的结果。结果从括号即截获分组中取出。

#include <iostream>
#include <string>
#include <regex>
 
// Walks through std::regex_search: a plain yes/no test, reading sub-matches
// with prefix()/suffix(), repeated searching, and a C-string search.
int main()
{
    std::string lines[] = {"Roses are #ff0000",
                           "violets are #0000ff",
                           "all of my base are belong to you"};

    // '#' followed by three groups of two lowercase hex digits (red, green, blue).
    std::regex color_regex("#([a-f0-9]{2})"
                            "([a-f0-9]{2})"
                            "([a-f0-9]{2})");

    // simple match
    for (const auto &line : lines) {
        std::cout << line << ": " << std::boolalpha
                  << std::regex_search(line, color_regex) << '\n';
    }
    std::cout << '\n';

    // show contents of marked subexpressions within each match
    std::smatch color_match;
    for (const auto& line : lines) {
        if(std::regex_search(line, color_match, color_regex)) {
            std::cout << "matches for '" << line << "'\n";
            std::cout << "Prefix: '" << color_match.prefix() << "'\n";
            // Index 0 is the whole match; 1..size()-1 are the capture groups.
            for (size_t i = 0; i < color_match.size(); ++i)
                std::cout << i << ": " << color_match[i] << '\n';
            std::cout << "Suffix: '" << color_match.suffix() << "\'\n\n";
        }
    }

    // repeated search (see also std::regex_iterator): keep searching in
    // whatever follows the previous match.
    std::string log(R"(
        Speed:    366
        Mass:    35
        Speed:    378
        Mass:    32
        Speed:    400
    Mass:    30)");
    // Accept spaces as well as tabs between the label and the number: the log
    // above is aligned with spaces, which a literal \t would never match.
    std::regex r(R"(Speed:[ \t]*\d+)");
    std::smatch sm;
    while(regex_search(log, sm, r))
    {
        std::cout << sm.str() << '\n';
        log = sm.suffix();
    }

    // C-style string demo
    std::cmatch cm;
    if(std::regex_search("this is a test", cm, std::regex("test")))
        std::cout << "\nFound " << cm[0] << " at position " << cm.prefix().length();
}

输出
Roses are #ff0000: true
violets are #0000ff: true
all of my base are belong to you: false

matches for 'Roses are #ff0000'
Prefix: 'Roses are '
0: #ff0000
1: ff
2: 00
3: 00
Suffix: ''

matches for 'violets are #0000ff'
Prefix: 'violets are '
0: #0000ff
1: 00
2: 00
3: ff
Suffix: ''

Speed:    366
Speed:    378
Speed:    400

Found test at position 10

参考：https://en.cppreference.com/w/cpp/regex/regex_search

6，抓取带转义字符的字段。“,” 是分隔符，“\,” 就不是分隔符。

#include <algorithm>
#include <iostream>
#include <iterator>
#include <regex>
#include <string>
#include <vector>
 
using namespace std;
 
int main() {
    const auto input = "please split,this,csv, ,line,\\,\n"s;
    const regex re{ "((?:[^\\\\,]|\\\\.)+)(?:,|$)" };
    const vector<string> m_vecFields{ sregex_token_iterator(cbegin(input), cend(input), re, 1), sregex_token_iterator() };
 
    cout << input << endl;
 
    copy(cbegin(m_vecFields), cend(m_vecFields), ostream_iterator<string>(cout, "\n"));//拷贝到输出
}

C++11：新式的字符串字面常量（String Literal）

auto = R"( ....这里是任意的字符包括换行引号等，可以把一本书的内容放这里。除了不能出现 )"... )"

R"( 是开头；)"是结尾。

如果里面包含 )"，我们可以在 R" 和 ( 之间、以及 ) 和 " 之间加上同一个自定义分隔符来规避：R"delim( lalalalalal )delim"。分隔符最长 16 个字符，不能包含空格、括号和反斜杠，只要选一个正文里不会出现的串即可，例如 R"!!( ... )!!"。

http://www.vishalchovatiya.com/regex-c/

Reading Time: 8 minutes

Regular expressions (or regex in short) is a much-hated & underrated topic so far with Modern C++. But at the same time, correct use of regex can spare you writing many lines of code. If you have spent quite enough time in the industry. And not knowing regex then you are missing out on 20-30% productivity. In that case, I highly recommend you to learn regex, as it is one-time investment(something similar to learn once, write anywhere philosophy).

Initially, In this article, I have decided to include regex-in-general also. But it doesn’t make sense, as there is already people/tutorial out there who does better than me in teaching regex. But still, I left a small section to address Motivation & Learning Regex. For the rest of the article, I will be focusing on functionality provided by C++ to work with regex. And if you are already aware of regex, you can use the above mind-map as a refresher.

Pointer: The C++ standard library offers several different “flavours” of regex syntax, but the default flavour (the one you should always use & I am demonstrating here) was borrowed wholesale from the standard for ECMAScript.

Contents [hide]

Motivation

I know its pathetic and somewhat confusing tool-set. Consider the below regex pattern for an example that extract time in 24-hour format i.e. HH:MM.

\b([01]?[0-9]|2[0-3]):([0-5]\d)\b

I mean! Who wants to work with this cryptic text?
And whatever running in your mind is 100% reasonable. In fact, I have procrastinated learning regex twice due to the same reason. But, believe me, all the ugly looking things are not that bad.
The way(↓) I am describing here won’t take more than 2-3 hours to learn regex that too intuitively. And After learning it you will see the compounding effect with return on investment over-the-time.

Learning Regex

Do not google much & try to analyse which tutorial is best. In fact, don’t waste time in such analysis. Because there is no point in doing so. At this point in time(well! if you don’t know the regex) what really matters is “Getting Started” rather than “What Is Best!”.
Just go to https://regexone.com without much overthinking. And complete all the lessons. Trust me here, I have explored many articles, courses(<=this one is free, BTW) & books. But this is best among all for getting started without losing motivation.
And after it, if you still have an appetite to solve more problem & exercises. Consider the below links:
1. Exercises on regextutorials.com
2. Practice problem on regex by hackerrank

std::regex & std::regex_error Example

// Shows that an invalid pattern makes the std::regex constructor throw
// std::regex_error, and how to inspect the error code.
int main() {
    bool rejected = false;
    try {
        const std::regex r(R"(\)"); // a lone trailing backslash is an incomplete escape
        (void)r;
    } catch (const std::regex_error &e) {
        // Only code() is portable: the text returned by what() differs between
        // standard library implementations and versions, so it is not compared.
        assert(e.code() == std::regex_constants::error_escape);
        rejected = true;
    }
    // Make sure the exception really happened instead of passing silently.
    assert(rejected);
    return EXIT_SUCCESS;
}

You see! I am using raw string literals. You can also use the normal string. But, in that case, you have to use a double backslash for an escape sequence.
The current implementation of std::regex is slow(as it needs regex interpretation & data structure creation at runtime), bloated and unavoidably require heap allocation(not allocator-aware). So, beware if you are using std::regex in a loop(see C++ Weekly – Ep 74 – std::regex optimize by Jason Turner). Also, there is only a single member function that I think could be of use is std::regex::mark_count() which returns a number of capture groups.
Moreover, if you are using multiple strings to create a regex pattern at run time. Then you may need exception handling i.e. std::regex_error to validate its correctness.

std::regex_search Example

int main() {
const string input = "ABC:1-> PQR:2;;; XYZ:3<<<"s;
const regex r(R"((\w+):(\w+);)");
smatch m;
 
if (regex_search(input, m, r)) {
assert(m.size() == 3);
assert(m[0].str() == "PQR:2;"); // Entire match 0是最长匹配到的内容
assert(m[1].str() == "PQR"); // Substring that matches 1st group 第一个group
assert(m[2].str() == "2"); // Substring that matches 2nd group
assert(m.prefix().str() == "ABC:1-> "); // All before 1st character match
assert(m.suffix().str() == ";; XYZ:3<<<"); // All after last character match
 
// for (string &&str : m) { // Alternatively. You can also do
// cout << str << endl;
// }
}
return EXIT_SUCCESS;
}

smatch is the specialization of std::match_results for std::string; it stores the information about a match so that it can be retrieved afterwards.

std::regex_match Example

Short & sweet example that you may always find in every regex book is email validation. And that is where our std::regex_match function fits perfectly.

// Returns true when the whole of `str` has the form <word>@<word>.com or
// <word>@<word>.in. This is a deliberately simple demo pattern, not a
// complete e-mail validator.
bool is_valid_email_id(std::string_view str) {
    // Compiled once on first use instead of on every call.
    static const std::regex r(R"(\w+@\w+\.(?:com|in))");
    // Match exactly the viewed range. string_view::data() is not guaranteed to
    // be null-terminated, so handing it to the const char* overload could read
    // past the end of the view.
    return std::regex_match(str.begin(), str.end(), r);
}
 
// One address that must be accepted and one (missing local part) that must not.
int main() {
    assert(is_valid_email_id("vishalchovatiya@ymail.com"));
    assert(!is_valid_email_id("@abc.com"));
    return EXIT_SUCCESS;
}

I know this is not a foolproof email validator regex pattern. But that is not my intention here.
Rather you should wonder why I have used std::regex_match! not std::regex_search! The rationale is simple std::regex_match matches the whole input sequence.
Also, Noticeable thing is static regex object to avoid constructing (“compiling/interpreting”) a new regex object every time the function entered.
The irony of above tiny code snippet is that it produces around 30k lines of assembly that too with -O3 flag. And that is ridiculous. But don’t worry this is already been brought to the ISO C++ community. And soon we may get some updates. Meanwhile, we do have other alternatives (mentioned at the end of this article).

Difference Between std::regex_match & std::regex_search?全部匹配match vs 子串匹配 search

You might be wondering why do we have two functions doing almost the same work? Even I had the doubt initially. But, after reading the description provided by cppreference over and over. I found the answer. And to explain that answer, I have created the example(obviously with the help of StackOverflow):

int main() {
const string input = "ABC:1-> PQR:2;;; XYZ:3<<<"s;
const regex r(R"((\w+):(\w+);)");
smatch m;
 
assert(regex_match(input, m, r) == false);
 
assert(regex_search(input, m, r) == true && m.ready() == true && m[1] == "PQR");
 
return EXIT_SUCCESS;
}

std::regex_match only returns true when the entire input sequence has been matched, while std::regex_search will succeed even if only a sub-sequence matches the regex.

std::regex_iterator Example 详细匹配信息

std::regex_iterator is helpful when you need very detailed information about matches & sub-matches.

#define C_ALL(X) cbegin(X), cend(X)
 
// Collects every match of "<word>:<digit>" as a full smatch and checks the
// whole match plus both capture groups of the first three matches.
int main() {
    const string input = "ABC:1-> PQR:2;;; XYZ:3<<<"s;
    const regex r(R"((\w+):(\d))");

    const vector<smatch> matches{
        sregex_iterator{cbegin(input), cend(input), r},
        sregex_iterator{}
    };

    // Row i holds, for match i: the whole match, group 1, group 2.
    const char* const expected[3][3] = {
        {"ABC:1", "ABC", "1"},
        {"PQR:2", "PQR", "2"},
        {"XYZ:3", "XYZ", "3"},
    };

    for (size_t row = 0; row < 3; ++row) {
        for (size_t group = 0; group < 3; ++group) {
            assert(matches[row].str(group) == expected[row][group]);
        }
    }

    return EXIT_SUCCESS;
}

In C++11, std::regex_iterator could be constructed from a temporary regex object, which left the iterator referring to a destroyed regex. Since C++14 this is prevented: the constructor overload taking an rvalue regex is deleted, so the regex object must outlive the iterator.

std::regex_token_iterator Example

std::regex_token_iterator is the utility you are going to use 80% of the time. It has a slight variation as compared to std::regex_iterator. The difference between std::regex_iterator & std::regex_token_iterator is
- std::regex_iterator points to match results.
- std::regex_token_iterator points to sub-matches.
In std::regex_token_iterator, each iterator contains only a single matched result. 把匹配子串拆开，用参数0，1，2分别去取。

#define C_ALL(X) cbegin(X), cend(X)
 
int main() {
const string input = "ABC:1-> PQR:2;;; XYZ:3<<<"s;
const regex r(R"((\w+):(\d))");
 
// Note: vector<string> here, unlike vector<smatch> as in std::regex_iterator
const vector<string> full_match{
sregex_token_iterator{C_ALL(input), r, 0}, // Mark `0` here i.e. whole regex match
sregex_token_iterator{}
};
assert((full_match == decltype(full_match){"ABC:1", "PQR:2", "XYZ:3"}));
 
const vector<string> cptr_grp_1st{
sregex_token_iterator{C_ALL(input), r, 1}, // Mark `1` here i.e. 1st capture group
sregex_token_iterator{}
};
assert((cptr_grp_1st == decltype(cptr_grp_1st){"ABC", "PQR", "XYZ"}));
 
const vector<string> cptr_grp_2nd{
sregex_token_iterator{C_ALL(input), r, 2}, // Mark `2` here i.e. 2nd capture group
sregex_token_iterator{}
};
assert((cptr_grp_2nd == decltype(cptr_grp_2nd){"1", "2", "3"}));
 
return EXIT_SUCCESS;
}

Inverted Match With std::regex_token_iterator -1去取不匹配的

#define C_ALL(X) cbegin(X), cend(X)
 
int main() {
const string input = "ABC:1-> PQR:2;;; XYZ:3<<<"s;
const regex r(R"((\w+):(\d))");
 
const vector<string> inverted{
sregex_token_iterator{C_ALL(input), r, -1}, // `-1` = parts that are not matched
sregex_token_iterator{}
};
assert((inverted == decltype(inverted){
"",
"-> ",
";;; ",
"<<<",
}));
 
return EXIT_SUCCESS;
}

std::regex_replace Example

// Replaces every "<word>:<digit>" in `text` by just the digit (capture group 2).
// `f` is passed through to std::regex_replace, e.g. format_no_copy to drop the
// text between the matches.
std::string transform_pair(std::string_view text, std::regex_constants::match_flag_type f = {}) {
    // Compiled once on first use instead of on every call.
    static const std::regex r(R"((\w+):(\d))");
    // Copy the viewed range into a std::string first: string_view::data() is
    // not guaranteed to be null-terminated, so it must not be used as a C string.
    return std::regex_replace(std::string(text), r, "$2", f);
}
 
// Exercises transform_pair with the default flags and with format_no_copy.
int main() {
    // Default flags: text between the matches is kept.
    assert(transform_pair("ABC:1, PQR:2"s) == "1, 2"s);

    // format_no_copy: only the replacements are emitted, the rest is dropped.
    assert(transform_pair("ABC:1, PQR:2"s, regex_constants::format_no_copy) == "12"s);

    return EXIT_SUCCESS;
}

You see, in the 2nd call of transform_pair we passed the flag std::regex_constants::format_no_copy, which means: do not copy the parts that are not matched. There are many such useful flags under std::regex_constants.
Also, we have constructed the fresh string holding the results. But what if we do not want a new string. Rather wants to append the results directly to somewhere(probably container or stream or already existing string). Guess what! the standard library has covered this also with overloaded std::regex_replace as follows: 追加到已存在的stream或者string中：

int main() {
const string input = "ABC:1-> PQR:2;;; XYZ:3<<<"s;
const regex r(R"(-|>|<|;| )");
 
// Prints "ABC:1 PQR:2 XYZ:3 "
regex_replace(ostreambuf_iterator<char>(cout), C_ALL(input), r, " ");
 
return EXIT_SUCCESS;
}

输出成string示例：

// Fragment: the same kind of regex_replace call, but writing into a
// std::stringstream so the result can be read back with ss.str().
// `input`, `r` and C_ALL are not defined here; they are expected to come from
// the surrounding example. std::stringstream is declared in <sstream>.
std::stringstream ss;
regex_replace(ostreambuf_iterator<char>(ss), C_ALL(input), r, "*");
    cout << ss.str();

Use Cases

Splitting a String With Delimiter 拆字符串

Although std::strtok is best suitable & optimal candidate for such a task. But just to demonstrate how you can do it with regex:

#define C_ALL(X) cbegin(X), cend(X)
 
// Splits `str` at every match of the regular expression `pattern` and returns
// the pieces between the matches. A delimiter at the very start of `str`
// yields an empty first piece.
std::vector<std::string> split(const std::string& str, std::string_view pattern) {
    // Build the regex from the exact viewed range: string_view::data() is not
    // guaranteed to be null-terminated, so it must not be used as a C string.
    const std::regex r(pattern.begin(), pattern.end());
    // Sub-match index -1 selects the text outside the matches.
    return std::vector<std::string>(
        std::sregex_token_iterator(str.cbegin(), str.cend(), r, -1),
        std::sregex_token_iterator());
}
 
// A path that starts with the delimiter produces an empty first element.
int main() {
    assert((split("/root/home/vishal", "/") == vector<string>{"", "root", "home", "vishal"}));
    return EXIT_SUCCESS;
}

View Code

Trim Whitespace From a String 去空格

// Returns `text` with every run of whitespace removed. Note that this removes
// whitespace inside the text as well, not only at the two ends.
std::string trim(std::string_view text) {
    // Compiled once on first use instead of on every call.
    static const std::regex r(R"(\s+)");
    // Copy the viewed range into a std::string first: string_view::data() is
    // not guaranteed to be null-terminated, so it must not be used as a C string.
    return std::regex_replace(std::string(text), r, "");
}
 
// All inner spaces are removed as well.
int main() {
    assert(trim("12 3 4 5"s) == "12345"s);
    return EXIT_SUCCESS;
}

View Code

Finding Lines Containing or Not Containing Certain Words From a File 搜文件内容匹配

// Concatenates `words`, inserting `delimiter` between neighbouring elements.
// Returns an empty string for an empty vector.
std::string join(const std::vector<std::string>& words, const std::string& delimiter) {
    std::string joined;
    bool first = true;
    for (const std::string& word : words) {
        // The delimiter goes between elements only, never before the first one.
        if (!first) {
            joined += delimiter;
        }
        joined += word;
        first = false;
    }
    return joined;
}
 
vector<string> lines_containing(const string& file, const vector<string>& words) {
auto prefix = "^.*?\\b("s;
auto suffix = ")\\b.*$"s;
 
// ^.*?\b(one|two|three)\b.*$
const auto pattern = move(prefix) + join(words, "|") + move(suffix);
 
ifstream infile(file);
vector<string> result;
 
for (string line; getline(infile, line);) {
if(regex_match(line, regex(pattern))) {
result.emplace_back(move(line));
}
}
 
return result;
}
 
// Expects test.txt (contents shown below) in the working directory; only the
// lines containing "one" or "two" must be returned, in file order.
int main() {
    assert((lines_containing("test.txt", {"one", "two"})
            == vector<string>{"This is one", "This is two"}));
    return EXIT_SUCCESS;
}
/* test.txt
This is one
This is two
This is three
This is four
*/

View Code

Same goes for finding lines that are not containing words with the pattern ^((?!(one|two|three)).)*$.

Finding Files in a Directory 搜目录下的文件

#include <string>
#include <vector>

#include <string_view>
#include <filesystem>
#include <regex>

namespace fs = std::filesystem;
using namespace std;
// Recursively walks `path` and returns every regular file whose file name
// (without directories) matches the regular expression `rg` completely.
// Errors from the directory walk or from an invalid pattern propagate as
// exceptions.
std::vector<std::filesystem::directory_entry> find_files(const std::filesystem::path& path, std::string_view rg) {
    std::vector<std::filesystem::directory_entry> result;
    // Build the regex from the exact viewed range: string_view::data() is not
    // guaranteed to be null-terminated, so it must not be used as a C string.
    const std::regex r(rg.begin(), rg.end());
    for (const std::filesystem::directory_entry& entry :
         std::filesystem::recursive_directory_iterator(path)) {
        if (std::filesystem::is_regular_file(entry.path()) &&
            std::regex_match(entry.path().filename().string(), r)) {
            result.push_back(entry);
        }
    }
    return result;
}

// Lists every file named "<word characters>.png" below the current working
// directory.
int main() {
    const auto pattern = R"(\w+\.png)";
    for (const auto& entry : find_files(fs::current_path(), pattern)) {
        cout << entry.path().string() << endl;
    }
    return EXIT_SUCCESS;
}

Tips For Using Regex-In-General

Use raw string literal for describing the regex pattern in C++.
Use the regex validating tool like https://regex101.com. What I like about regex101 is code generation & time-taken(will be helpful when optimizing regex) feature.
Also, try to add generated explanation from validation tool as a comment exactly above the regex pattern in your code.
Performance:
- If you are using alternation, try to arrange options in high probability order like com|net|org.
- Try to use lazy quantifiers if possible.
- Use non-capture groups wherever possible.
- Disable Backtracking.
- Using the negated character class is more efficient than using a lazy dot.

Parting Words

It’s not just that you will use regex with only C++ or any other language. I myself use it mostly on IDE(in vscode to analyse log files) & on Linux terminal. But, bear in mind that overusing regex gives the feel of cleverness. And, it’s a great way to make your co-workers (and anyone else who needs to work with your code) very angry with you. Also, regex is overkill for most parsing tasks that you’ll face in your daily work.

The regexes really shine for complicated tasks where hand-written parsing code would be just as slow anyway; and for extremely simple tasks where the readability and robustness of regular expressions outweigh their performance costs.

One more notable thing is current regex implementation(till 19th June 2020) in standard libraries have performance & code bloating issues. So choose wisely between Boost, CTRE and Standard library versions. Most probably you might go with the Hana Dusíková’s work on Compile Time Regular Expression. Also, her CppCon talk from 2018 & 2019‘s would be helpful especially if you plan to use regex in embedded systems.

Do you like it☝️? Get such articles directly into the inbox…!📥

url 正则

https://zapier.com/blog/extract-links-email-phone-regex/

https://daringfireball.net/2010/07/improved_regex_for_matching_urls?utm_source=zapier.com&utm_medium=referral&utm_campaign=zapier

https://gist.github.com/gruber/249502

url识别类型用例：

Test data for the URL-matching regex pattern presented here:

http://daringfireball.net/2010/07/improved_regex_for_matching_urls


Matches the right thing in the following lines:

    http://foo.com/blah_blah
    http://foo.com/blah_blah/
    (Something like http://foo.com/blah_blah)
    http://foo.com/blah_blah_(wikipedia)
    http://foo.com/more_(than)_one_(parens)
    (Something like http://foo.com/blah_blah_(wikipedia))
    http://foo.com/blah_(wikipedia)#cite-1
    http://foo.com/blah_(wikipedia)_blah#cite-1
    http://foo.com/unicode_(✪)_in_parens
    http://foo.com/(something)?after=parens
    http://foo.com/blah_blah.
    http://foo.com/blah_blah/.
    <http://foo.com/blah_blah>
    <http://foo.com/blah_blah/>
    http://foo.com/blah_blah,
    http://www.extinguishedscholar.com/wpglob/?p=364.
    http://✪df.ws/1234
    rdar://1234
    rdar:/1234
    x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E
    message://%3c330e7f840905021726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e
    http://➡.ws/䨹
    www.c.ws/䨹
    <tag>http://example.com</tag>
    Just a www.example.com link.
    http://example.com/something?with,commas,in,url, but not at end
    What about <mailto:gruber@daringfireball.net?subject=TEST> (including brokets).
    mailto:name@example.com
    bit.ly/foo
    “is.gd/foo/”
    WWW.EXAMPLE.COM
    http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752
    http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))
    http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634))

    
Should fail against:
    6:00p
    filename.txt

    
Known to fail against:
    http://example.com/quotes-are-“part”
    ✪df.ws/1234
    example.com
    example.com/

View Code

将html网页中的网址识别出来，追加link标签。

比如 <a href=xxx> www.baidu.com</a> www.baidu.com

运行后变成：<a href=xxx> www.baidu.com</a> <a href=xxx> www.baidu.com</a>

#include "info_extractor.h"

#include <iostream>
#include <regex>
#include <string>
#include <vector>

using namespace std;

// Turns bare URLs and e-mail addresses inside an HTML fragment into
// <a href="..."> links. Text that already sits inside an <a ...>...</a>,
// <script ...>...</script> or <style ...>...</style> element is left untouched.
namespace mypackage {

// Replacement template in regex_replace "$n" syntax. Not referenced by the
// functions in this namespace.
static const std::string REPLACE_STR = "<a href=\"$2$1\">$1</a>";
// E-mail address; group 2 is the optional "mailto://" prefix.
static const auto EMAIL_PATTERN =
    R"(\b((?:(mailto:\/\/)?(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]))))";
// URL; group 1 is the whole URL, group 2 the scheme when one is present.
// NOTE(review): compiled without icase, so only lowercase schemes/hosts match
// (e.g. WWW.EXAMPLE.COM does not) — confirm whether that is intended.
static const auto URL_PATTERN =
    R"(\b((?:(https?|ftp|file|callto|tel|mms|gopher|news):\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])))";
// Literal tag openers. No longer used by url_extractor, which needs a
// case-insensitive test; kept so the namespace contents stay the same.
static const std::string ANCHOR_START = "<a ";
static const std::string SCRIPT_START = "<script ";
static const std::string STYLE_START = "<style ";
// A complete <a ...>...</a>, <script ...>...</script> or <style ...>...</style>.
static const auto ANCHOR_FULL_MATCH =
    R"!!(((<\s*a [\w\s\W\S]*?<\s*/a\s*>)|(<script [\w\s\W\S]*?</script>)|(<style [\w\s\W\S]*?</style>)))!!";
// An opening <a ... that runs to the end of the input.
static const std::string ANCHOR_START_MATCH = "(<\\s*a [\\w\\s\\W\\S]*)";
// Word/space characters followed by the next '>' or '<'; group 1 tells which.
static const std::string TAG_END = "[\\w\\s]*(>|<)";

//#define DEBUG

// Debug aid: prints the text that is about to be parsed (only with DEBUG).
void printWillParse(std::string& input) {
#ifdef DEBUG
  std::cout << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx;" << std::endl;
  std::cout << "                          parse:" << input << std::endl;
  std::cout << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx;" << std::endl;
#endif
}

// Appends one fragment to split_vector (and prints it when DEBUG is defined).
void processItem(std::string&& s, std::vector<std::string>& split_vector) {
#ifdef DEBUG
  std::cout << "\n---------------------------------;" << std::endl;
  std::cout << "===item===" << s << std::endl;
  std::cout << "---------------------------------;" << std::endl;
#endif
  split_vector.emplace_back(std::move(s));
}

// Cuts `input` into fragments and appends them to split_vector in document
// order: each complete <a>/<script>/<style> element is one fragment, and the
// text before, between and after them forms the other fragments. An unclosed
// "<a ..." near the end becomes one final fragment reaching to the end.
// `input` is modified: it is shortened to the text after the last complete
// element. Always returns true.
bool splitAnchor(std::string& input, std::vector<std::string>& split_vector) {
  // Compiled once; both are case-insensitive so <A ...> is treated like <a ...>.
  static const std::regex full_element(ANCHOR_FULL_MATCH, std::regex::icase);
  static const std::regex open_anchor(ANCHOR_START_MATCH, std::regex::icase);

  std::smatch sm;
  while (std::regex_search(input, sm, full_element)) {
    if (sm.prefix().matched) {
      processItem(sm.prefix().str(), split_vector);
    }
    processItem(sm[1].str(), split_vector);
    // Continue behind the element; sm is refilled before it is read again.
    input = sm.suffix().str();
  }

  // Whatever is left holds no complete element any more.
  if (std::regex_search(input, sm, open_anchor)) {
    if (sm.prefix().length() > 0) {
      processItem(sm.prefix().str(), split_vector);
    }
    processItem(sm[1].str() + sm.suffix().str(), split_vector);
  } else if (!input.empty()) {
    // Pass a copy so that `input` keeps the remaining text, as before.
    processItem(std::string(input), split_vector);
  }

  return true;
}

// Returns true when the first '<' or '>' found in `s` (after optional
// word/space characters) is a '>'. Used on the text following a match to
// guess that the match sits inside a tag, e.g. in an attribute value.
bool hasEndTag(std::string&& s) {
  static const std::regex tag_edge(TAG_END);
  std::smatch sm;
  return std::regex_search(s, sm, tag_edge) && sm[1].str() == ">";
}

// Debug aid: prints every sub-match of `m`, one per line.
void printMatch(const std::smatch& m) {
  for (size_t i = 0; i < m.size(); ++i) {
    std::ssub_match sub_match = m[i];
    std::string piece = sub_match.str();
    std::cout << "  submatch " << i << ": " << piece << '\n';
  }
}

// Shared worker: wraps every match of `re` in `s` in an <a href> element and
// returns the rewritten text. `protocol` is prepended to the href when the
// regex has no group 2 or group 2 did not take part in the match. A match
// that appears to sit inside a tag (see hasEndTag) is copied unchanged.
// `s` is modified: it is shortened to the text after the last match.
static std::string replaceByRegex(std::string& s,
                                  const std::regex& re,
                                  const std::string& protocol) {
  std::string out;
  std::smatch sm;
  while (std::regex_search(s, sm, re)) {
    if (sm[0].length() == 0) {
      break;  // an empty match would never shorten `s`; avoid looping forever
    }
    out += sm.prefix().str();
    const std::string hit = sm[0].str();
    if (hasEndTag(sm.suffix().str())) {
      out += hit;
    } else {
      out += "<a href=\"";
      if (sm.size() < 3 || !sm[2].matched) {
        out += protocol;
      }
      out += hit;
      out += "\">";
      out += hit;
      out += "</a>";
    }
    s = sm.suffix().str();
  }
  out += s;
  return out;
}

// Same as replaceByRegex, for a pattern given as text. The pattern is
// compiled once per call. Throws std::regex_error for an invalid pattern.
std::string replaceByPattern(std::string& s,
                             const std::string& pattern,
                             const std::string& protocol) {
  return replaceByRegex(s, std::regex{pattern}, protocol);
}

// The URL regex, compiled on first use and shared by replaceUrl/extractUrl.
static const std::regex& urlRegex() {
  static const std::regex re(URL_PATTERN);
  return re;
}

// Links every URL in `s`; URLs without a scheme get "http://" in the href.
// `s` is modified (see replaceByRegex).
std::string replaceUrl(std::string& s) {
  return replaceByRegex(s, urlRegex(), "http://");
}

// Links every e-mail address in `s`; addresses without the "mailto://" prefix
// get it in the href.
// NOTE(review): the usual URI form is "mailto:" without slashes — confirm.
std::string replaceEmail(std::string&& s) {
  static const std::regex email_re(EMAIL_PATTERN);
  return replaceByRegex(s, email_re, "mailto://");
}

// Appends every URL found in `s` to `v` and returns `v`. `s` is consumed.
std::vector<std::string>& extractUrl(std::string&& s, std::vector<std::string>& v) {
  std::smatch sm;
  while (std::regex_search(s, sm, urlRegex())) {
    v.emplace_back(sm[1].str());
    s = sm.suffix().str();
  }
  return v;
}

// True when a fragment produced by splitAnchor starts with an <a, <script or
// <style opener. The test is case-insensitive and allows spaces after '<',
// matching what the splitter itself accepts.
static bool isProtectedFragment(const std::string& piece) {
  static const std::regex opener(R"(^(?:<\s*a |<script |<style ))", std::regex::icase);
  return std::regex_search(piece, opener);
}

// Entry point: returns `input` with bare URLs and e-mail addresses turned into
// links; existing <a>, <script> and <style> elements are copied unchanged.
// `input` is modified (see splitAnchor).
std::string url_extractor(std::string& input) {
  std::vector<std::string> pieces;
  splitAnchor(input, pieces);

  std::string result;
  for (std::string& piece : pieces) {
    if (isProtectedFragment(piece)) {
      result += piece;
    } else {
      result += replaceEmail(replaceUrl(piece));
    }
  }
  return result;
}
}  // namespace mypackage

View Code

posted @ 2020-11-09 14:15 Bigben 阅读(1980) 评论(0) 编辑收藏举报

刷新页面返回顶部