Eloquent JavaScript #09# Regular Expressions

Notes

1、正则表达式帮助我们在字符串中寻找特定模式。

js创建正则表达式的两种等价写法:

let re1 = new RegExp("abc");
let re2 = /abc/;

 

2、应用正则表达式

console.log(/abc/.test("abcde"));
// → true
console.log(/abc/.test("abxde"));
// → false

 

3、字符集合

\d Any digit character
\w An alphanumeric character (“word character”)
\s Any whitespace character (space, tab, newline, and similar)
\D A character that is not a digit
\W A nonalphanumeric character
\S A nonwhitespace character
. Any character except for newline
/abc/ A sequence of characters
/[abc]/ Any character from a set of characters
/[^abc]/ Any character not in a set of characters
/[0-9]/ Any character in a range of characters
/x+/ One or more occurrences of the pattern x
/x+?/ One or more occurrences, nongreedy
/x*/ Zero or more occurrences
/x?/ Zero or one occurrence
/x{2,4}/ Two to four occurrences
/(abc)/ A group
/a|b|c/ Any one of several patterns
/\d/ Any digit character
/\w/ An alphanumeric character (“word character”)
/\s/ Any whitespace character
/./ Any character except newlines
/\b/ A word boundary
/^/ Start of input
/$/ End of input

\d等转义字符可以放在 [ ] 里而不丧失含义,但是 . 和 + 之类的特殊符号不行,在 [ ] 里会变为普通的字符。

整体取反,非0非1:

let notBinary = /[^01]/;
console.log(notBinary.test("1100100010100110"));
// → false
console.log(notBinary.test("1100100010200110"));
// → true

 

4、重复匹配

+ one or more,* zero or more

console.log(/'\d+'/.test("'123'"));
// → true
console.log(/'\d+'/.test("''"));
// → false
console.log(/'\d*'/.test("'123'"));
// → true
console.log(/'\d*'/.test("''"));
// → true

? zero or one

let neighbor = /neighbou?r/;
console.log(neighbor.test("neighbour"));
// → true
console.log(neighbor.test("neighbor"));
// → true

{2} means a pattern should occur a precise number of times (here, exactly twice). It is also possible to specify a range this way: {2,4} means the element must occur at least twice and at most four times.

let dateTime = /\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{2}/;
console.log(dateTime.test("1-30-2003 8:45"));
// → true

 You can also specify open-ended ranges when using braces by omitting the number after the comma. So, {5,} means five or more times.

 

5、分组(子表达式)

括号内的n个元素被视作一个整体元素(分组,子表达式):

let cartoonCrying = /boo+(hoo+)+/i;
console.log(cartoonCrying.test("Boohoooohoohooo"));
// → true

i表示该表达式大小写不敏感。

 

6、进行正则匹配的另外一种方式

可以让我们获取额外的信息:

let match = /\d+/.exec("one two 100");
console.log(match);
// → ["100"]
console.log(match.index);
// → 8

exec的返回值:匹配失败为null,成功则如上所示。

等价写法:

console.log("one two 100".match(/\d+/));
// → ["100"]

含括号表达式的情况:

let quotedText = /'([^']*)'/;
console.log(quotedText.exec("she said 'hello'"));
// → ["'hello'", "hello"]

console.log(/bad(ly)?/.exec("bad"));
// → ["bad", undefined]
console.log(/(\d)+/.exec("123"));
// → ["123", "3"]

返回数组的第一个元素为整个正则表达式匹配的字符串,而从第二个元素开始依次为各个 () 内正则(子表达式)匹配的字符串(该分组没有参与匹配就是undefined,被重复匹配多次就取最后一次)。容易知道,这些元素几乎总是第一个元素的子串。

 

7、The Date class

console.log(new Date());
// → Sat Sep 01 2018 13:54:43 GMT+0800 (中国标准时间)

console.log(new Date(2009, 11, 9));
// → Wed Dec 09 2009 00:00:00 GMT+0800 (中国标准时间)
console.log(new Date(2009, 11, 9, 12, 59, 59, 999));
// → Wed Dec 09 2009 12:59:59 GMT+0800 (中国标准时间)

console.log(new Date(1997, 10, 19).getTime());
// → 879868800000
console.log(new Date(1387407600000));
// → Thu Dec 19 2013 07:00:00 GMT+0800 (中国标准时间)

console.log(new Date().getTime());
// → 1535781283593
console.log(Date.now());
// → 1535781283593

通过正则表达式,由String创建日期:

"use strict";

/**
 * Build a Date from text containing a date written as M-D-YYYY.
 *
 * The first run of text shaped like "1-30-2003" (one or two digits, a dash,
 * one or two digits, a dash, four digits) found anywhere in the input is used.
 *
 * @param {string} string - Text containing a month-day-year date.
 * @returns {Date} The Date constructed from the captured year, month and day.
 * @throws {TypeError} If the input contains no M-D-YYYY date.
 */
function getDate(string) {
  const match = /(\d{1,2})-(\d{1,2})-(\d{4})/.exec(string);
  if (match === null) {
    // Without this guard the failure shows up as an opaque
    // "null is not iterable" destructuring error.
    throw new TypeError(`No M-D-YYYY date found in '${string}'`);
  }
  // Index 0 is the whole match; only the three captured groups are needed.
  const [, month, day, year] = match;
  // The Date constructor counts months from 0, hence the "- 1".
  return new Date(Number(year), Number(month) - 1, Number(day));
}
console.log(getDate("1-30-2003"));
// → Thu Jan 30 2003 00:00:00 GMT+0100 (CET)

PS. 下划线除了用来占位外没有其它含义。

 

8、强制匹配整个字符串

利用 ^ 和 $ 。例如/^\d+$/匹配完全由数字构成的字符串,/^!/ 匹配由!开头的字符串,而/x^/ 啥也匹配不了。

用 \b 标注单词边界:

console.log(/cat/.test("concatenate"));
// → true
console.log(/\bcat\b/.test("concatenate"));
// → false
console.log(/\bcat\b/.test("xx cat xx"));
// → true

 

9、Choice patterns

let animalCount = /\b\d+ (pig|cow|chicken)s?\b/;
console.log(animalCount.test("15 pigs"));
// → true
console.log(animalCount.test("15 pigchickens"));
// → false

 

10、正则匹配的机制

当你进行正则匹配时(test或者exec),正则引擎将从所给字符串的开头开始尝试匹配,接着是第二个字符,第三个字符... 试图在所给字符串中寻找一个匹配,直到找到一个匹配项或者到达字符串末尾结束。要么返回第一个匹配,要么什么都匹配不到。

/**
 * 模拟用正则\b\d+ (pig|cow|chicken)s?\b
 * 匹配"the 3 pigs"
 */

const str = "the 3 pigs";

/**
 * Hand-written simulation of how a regex engine runs
 * /\b\d+ (pig|cow|chicken)s?\b/ against a string.
 *
 * Every position from `start` onwards is tried in turn as the beginning of
 * a match; the first position where the whole pattern succeeds wins.
 *
 * @param {string} str - The text to search.
 * @param {number} start - Index at which to begin trying (negative values are
 *   treated as 0).
 * @returns {string[] | null} A one-element array holding the matched text,
 *   or null when no position yields a match.
 */
function simulateRegex(str, start) {
    const isDigit = ch => ch !== undefined && ch >= "0" && ch <= "9";
    // Same character class as \w: ASCII letters, digits and underscore.
    const isWordChar = ch => ch !== undefined &&
        ((ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z") || isDigit(ch) || ch === "_");
    // \b holds where exactly one side is a word character; positions outside
    // the string count as non-word, so punctuation and string ends both work.
    const isBoundary = pos => isWordChar(str[pos - 1]) !== isWordChar(str[pos]);
    const animals = ["pig", "cow", "chicken"];

    // "<" rather than "!=" so a start beyond the end terminates immediately.
    for (let matchStart = Math.max(0, start); matchStart < str.length; ++matchStart) {
        let cursor = matchStart;
        // \b
        if (!isBoundary(cursor)) continue;
        // \d+ : at least one digit, then greedily as many as possible.
        if (!isDigit(str[cursor])) continue;
        while (isDigit(str[cursor])) {
            cursor++;
        }
        // A single space. Giving digits back cannot help here, because the
        // character after a shorter digit run would be a digit, not a space.
        if (str[cursor] !== " ") continue;
        cursor++;
        // (pig|cow|chicken) : alternatives are tried in order.
        const animal = animals.find(word => str.startsWith(word, cursor));
        if (animal === undefined) continue;
        cursor += animal.length;
        // s?\b : greedy first (take the "s"), then backtrack to no "s".
        if (str[cursor] === "s" && isBoundary(cursor + 1)) {
            return [str.slice(matchStart, cursor + 1)];
        }
        if (isBoundary(cursor)) {
            // slice's end index is exclusive, so `cursor` is already the end.
            return [str.slice(matchStart, cursor)];
        }
    }
    return null;
}

let match = simulateRegex(str, 4);
console.log(match);
// → ["3 pigs"]

 

11、回溯Backtracking

正则引擎在进行分支匹配(|)或重复匹配(+ *)时,如果发现无法继续往下匹配,就会进行“回溯”。

在进行分支匹配时,如果第一个分支就匹配成功,就不再匹配其它分支,如果不成功就会回溯到分支的入口,进入到另外一个分支继续匹配。

而进行重复匹配时,例如用/^.*x/匹配"abcxe",.*会首先把所有字符消费干净,当正则引擎发现最后还需要一个x时,*操作符会尝试少匹配一个字符,但是仍然没发现x,于是继续回溯,直到.*只匹配abc、其后正好是x为止,最终整个表达式匹配到字符串abcx。

 

12、The replace method

replace配合正则:

console.log("papa".replace("p", "m"));
// → mapa

console.log("Borobudur".replace(/[ou]/, "a"));
// → Barobudur
console.log("Borobudur".replace(/[ou]/g, "a")); // g代表global全部
// → Barabadar

replace的真正强大之处在于可以用“$数字”引用匹配字符串:

console.log(
  "Liskov, Barbara\nMcCarthy, John\nWadler, Philip"
    .replace(/(\w+), (\w+)/g, "$2 $1"));
// → Barbara Liskov
//   John McCarthy
//   Philip Wadler


"hello, word, every, one".replace(/(\w+),/g, "$1 "); // “$+数字”引用匹配中的分组
// → "hello  word  every  one"
"hello, word, every, one".replace(/one/g, "$& $&"); // “$&”引用整个匹配
// → "hello, word, every, one one"

还可以传入函数:

"hello, word, every, one".replace(/(\w+),/g, str => str.toUpperCase()); 
// → "HELLO, WORD, EVERY, one"
let stock = "1 lemon, 2 cabbages, and 101 eggs";
/**
 * Replacement callback for String.prototype.replace that lowers a counted
 * item ("<amount> <unit>") by one.
 *
 * @param {string} match - The whole matched text (unused).
 * @param {string} amount - The captured quantity.
 * @param {string} unit - The captured unit word.
 * @returns {string} The new quantity followed by the unit: "no <unit>" when
 *   nothing is left, and the unit minus its last character when one is left.
 */
function minusOne(match, amount, unit) {
  const remaining = Number(amount) - 1;
  switch (remaining) {
    case 1:
      // Only one left: drop the final character of the unit (the plural "s").
      return `1 ${unit.slice(0, -1)}`;
    case 0:
      return `no ${unit}`;
    default:
      return `${remaining} ${unit}`;
  }
}
console.log(stock.replace(/(\d+) (\w+)/g, minusOne));
// → no lemon, 1 cabbage, and 100 eggs

 

13、贪婪Greed

/**
 * Strip comments from a piece of source text.
 *
 * Line comments (two slashes up to the end of the line) and block comments
 * (slash-star through star-slash) are replaced with the empty string. The
 * block-comment part uses the greedy `[^]*`, so it extends to the LAST
 * star-slash in the input: two block comments in one string are removed
 * together with everything between them.
 *
 * @param {string} code - Source text to process.
 * @returns {string} The text with the matched spans removed.
 */
function stripComments(code) {
  const greedyComment = /\/\/.*|\/\*[^]*\*\//g;
  const stripped = code.replace(greedyComment, "");
  return stripped;
}
console.log(stripComments("1 + /* 2 */3"));
// → 1 + 3
console.log(stripComments("x = 10;// ten!"));
// → x = 10;
console.log(stripComments("1 /* a */+/* b */ 1"));
// → 1  1

可以用replace来去掉代码中的所有注释。

[^]可以匹配任何字符,因为 /**/可能跨多行,句点 . 无法匹配换行符号。

然而上面最后一行代码结果却出错了,这是为什么呢?

因为 +、*、? 和 {} 这些重复操作符都是贪婪的,就像“回溯”里面提到的,它们总是先尽可能多地消费字符,直到无路可走才会回头,这样理所当然会匹配到更长的那一个。解决方案就是在这些符号后面加问号(+?、*?、??、{}?),这样它们就会匹配尽可能少的字符。

/**
 * Strip comments from a piece of source text.
 *
 * Line comments (two slashes up to the end of the line) and block comments
 * (slash-star through star-slash) are replaced with the empty string. The
 * block-comment part uses the non-greedy `[^]*?`, so each block comment ends
 * at the nearest star-slash and code between two comments is kept.
 *
 * @param {string} code - Source text to process.
 * @returns {string} The text with the matched spans removed.
 */
function stripComments(code) {
  const lazyComment = /\/\/.*|\/\*[^]*?\*\//g;
  const stripped = code.replace(lazyComment, "");
  return stripped;
}
console.log(stripComments("1 /* a */+/* b */ 1"));
// → 1 + 1

当要用到重复匹配符时,先考虑用非贪婪版本的。

 

14、动态构建正则表达式

利用new RegExp(拼接字符串, "gi")构建,gi表示global替换全部和大小写不敏感。

let name = "harry";
let text = "Harry is a suspicious character.";
let regexp = new RegExp("\\b(" + name + ")\\b", "gi");
console.log(text.replace(regexp, "_$1_"));
// → _Harry_ is a suspicious character.

let name = "dea+hl[]rd";
let text = "This dea+hl[]rd guy is super annoying.";
let escaped = name.replace(/[\\[.+*?(){|^$]/g, "\\$&");
// escaped → "dea\+hl\[]rd"
let regexp = new RegExp("\\b" + escaped + "\\b", "gi");
console.log(text.replace(regexp, "_$&_"));
// → This _dea+hl[]rd_ guy is super annoying.

 

15、Search

正则版indexOf:

console.log("  word".search(/\S/));
// → 2
console.log("    ".search(/\S/));
// → -1

 

16、The lastIndex property

需求:设置从字符串的某个字符开始匹配

问题:没有方便的办法

理由:不方便正是js的特性。。。。

解决方案:在【严格的条件】下用lastIndex设定起始位置

严格的条件:表达式必须开启g(global)或者y(sticky)选项,并且必须通过exec方式执行匹配。

lastIndex:正则对象的一个属性,数字,决定了下一个匹配从第几个字符开始。在严格条件 ↑ 下设定才有效。非严格条件下改变该值是毫无作用的。

let pattern = /y/g;
pattern.lastIndex = 3;
let match = pattern.exec("xyzzy");
console.log(match.index);
// → 4
console.log(pattern.lastIndex);
// → 5

仅global:匹配成功,自动更新lastIndex为匹配成功位置的下一个位置(如上),匹配失败,lastIndex重新设置为0。

global:从str[lastIndex]开始向后搜索匹配

sticky:从str[lastIndex]直接开始匹配,不向后搜索。

let global = /abc/g;
console.log(global.exec("xyz abc"));
// → ["abc"]
let sticky = /abc/y;
console.log(sticky.exec("xyz abc"));
// → null

 所以只需简单调整一下lastIndex就可以让上面成功的失败、失败的成功:

let global = /abc/g;
global.lastIndex = 6; // 从c开始向后搜索匹配
console.log(global.exec("xyz abc"));
// → null
let sticky = /abc/y;
sticky.lastIndex = 4; // 从a开始匹配
console.log(sticky.exec("xyz abc"));
// → ["abc"]

因为在global启用时,lastIndex在匹配完之后是要自动更新的,所以,当用一个正则对象匹配多次的时候就会出现坑爹的结果:

let digit = /\d/g;
console.log(digit.exec("here it is: 1"));
// → ["1"]
console.log(digit.exec("and now: 1"));
// → null

在g和y都不启用时不会有这方面的顾虑;注意y(sticky)同样会在匹配后自动更新lastIndex,因此复用sticky正则对象时也要小心。

global的另外一方面影响在于,它改变了match的行为:

console.log("Banana".match(/an/g));
// → ["an", "an"]
console.log(/an/g.exec("Banana"));
// → ["an", index: 1, input: "Banana", groups: undefined] 
// global改变了match的行为:不带g时,match与exec等价,
// 返回 [整体匹配, 子表达式匹配, ...];带g后,match改为
// 返回所有匹配项组成的数组,不再包含子表达式匹配和index等信息

总结。。慎用global

 

17、遍历匹配项

利用global模式下的lastIndex机制应该是最简便的方法。

let input = "A string with 3 numbers in it... 42 and 88.";
let number = /\b\d+\b/g;
let match;
while (match = number.exec(input)) {
  console.log("Found", match[0], "at", match.index);
}
// → Found 3 at 14
//   Found 42 at 33
//   Found 88 at 40

 

18、解析INI文件

/**
 * Parse INI-formatted text into a nested plain object.
 *
 * Recognised line shapes:
 *   - `name=value`   stored as a string on the current section
 *   - `[section]`    starts a new section object on the result
 *   - blank lines and lines whose first non-blank character is `;` are skipped
 * Settings that appear before the first section header go on the result
 * object itself.
 *
 * @param {string} string - INI text; lines may end in "\n" or "\r\n".
 * @returns {Object} Top-level settings plus one nested object per section.
 * @throws {Error} If a line matches none of the recognised shapes.
 */
function parseINI(string) {
    // Store `key` as an own data property. Plain assignment would send the
    // key "__proto__" through the inherited accessor, replacing the object's
    // prototype (or silently dropping a string value) instead of storing it.
    const setOwn = (target, key, value) => {
        Object.defineProperty(target, key, {
            value,
            writable: true,
            enumerable: true,
            configurable: true,
        });
        return value;
    };

    // Start with an object to hold the top-level fields
    const result = {};
    let section = result;
    for (const line of string.split(/\r?\n/)) {
        const setting = /^(\w+)=(.*)$/.exec(line);
        if (setting !== null) {
            setOwn(section, setting[1], setting[2]);
            continue;
        }
        const header = /^\[(.*)\]$/.exec(line);
        if (header !== null) {
            // Later settings are written to this fresh section object.
            section = setOwn(result, header[1], {});
            continue;
        }
        // Anything left must be blank or a `;` comment.
        if (!/^\s*(;.*)?$/.test(line)) {
            throw new Error(`Line '${line}' is not valid.`);
        }
    }
    return result;
}

console.log(parseINI(`
searchengine=https://duckduckgo.com/?q=$1
spitefulness=9.7

; comments are preceded by a semicolon...
; each section concerns an individual enemy
[larry]
fullname=Larry Doe
type=kindergarten bully
website=http://www.geocities.com/CapeCanaveral/11451

[davaeorn]
fullname=Davaeorn
type=evil wizard
outputdir=/home/marijn/enemies/davaeorn`));
// → davaeorn:  { fullname: "Davaeorn", type: "evil wizard", outputdir: "/home/marijn/enemies/davaeorn" }​
// larry:  { fullname: "Larry Doe", type: "kindergarten bully", website: "http://www.geocities.com/CapeCanaveral/11451" }​
// searchengine: "https://duckduckgo.com/?q=$1"​
// spitefulness: "9.7"

 

19、国际字符

console.log(/🍎{3}/.test("🍎🍎🍎"));
// → false
console.log(/<.>/.test("<🌹>"));
// → false
console.log(/<.>/u.test("<🌹>"));
// → true

🍎由两个码元(code unit)构成,可以视为两个字符,🍎{3} 后面的量词实际针对的只是构成🍎的第二个码元;同理,不带u时句点 . 只能匹配一个码元,匹配不了整个🌹。解决方法是在正则后添加u(for Unicode),让正则按完整的字符来处理。然而u并不是默认行为,因为改变默认行为可能导致依赖旧行为的现有代码出现问题。

另外,在添加u的前提下,还可以使用\p{Property=Value}来匹配具有指定Unicode属性的字符:

console.log(/\p{Script=Greek}/u.test("α"));
// → true
console.log(/\p{Script=Arabic}/u.test("α"));
// → false
console.log(/\p{Alphabetic}/u.test("α"));
// → true
console.log(/\p{Alphabetic}/u.test("!"));
// → false

 

Exercises

① Regexp golf

// Fill in the regular expressions

verify(/ca[rt]/,
       ["my car", "bad cats"],
       ["camper", "high art"]);

verify(/pr?op/,
       ["pop culture", "mad props"],
       ["plop", "prrrop"]);

verify(/ferr(et|y|ari)/,
       ["ferret", "ferry", "ferrari"],
       ["ferrum", "transfer A"]);

verify(/ious\b/,
       ["how delicious", "spacious room"],
       ["ruinous", "consciousness"]);

verify(/\s[.,:;]/,
       ["bad punctuation ."],
       ["escape the period"]);

verify(/\w{7}/,
       ["hottentottententen"],
       ["no", "hotten totten tenten"]);

verify(/\b[^\We]+\b/i,
       ["red platypus", "wobbling nest"],
       ["earth bed", "learning ape", "BEET"]);


/**
 * Check a regular expression against strings it should and should not match,
 * logging one line to the console for every string that behaves unexpectedly.
 *
 * @param {RegExp} regexp - Pattern under test; the placeholder /.../ is skipped.
 * @param {string[]} yes - Strings the pattern is expected to match.
 * @param {string[]} no - Strings the pattern is expected not to match.
 * @returns {undefined} Nothing; results are reported via console.log only.
 */
function verify(regexp, yes, no) {
  // Ignore unfinished exercises
  if (regexp.source === "...") return;
  // A regex with the g or y flag resumes from lastIndex on each test() call;
  // reset it so every string is tested from its start.
  const matches = (str) => {
    regexp.lastIndex = 0;
    return regexp.test(str);
  };
  for (const str of yes) {
    if (!matches(str)) {
      console.log(`Failure to match '${str}'`);
    }
  }
  for (const str of no) {
    if (matches(str)) {
      console.log(`Unexpected match for '${str}'`);
    }
  }
}

-—————— -- -——-—— -- - -----————------------ -- --     -- - -- —

② Quoting style

let text = "'I'm the cook,' he said, 'it's my job.'";
// Change this call.
console.log(text.replace(/'|([\w]'[\w])/g, str => str == "'" ? '"' : str));
// → "I'm the cook," he said, "it's my job."

课本解答:

let text = "'I'm the cook,' he said, 'it's my job.'";

console.log(text.replace(/(^|\W)'|'(\W|$)/g, '$1"$2'));
// → "I'm the cook," he said, "it's my job."

-—————— -- -——-—— -- - -----————------------ -- --     -- - -- —

③ Numbers again

// Fill in this regular expression.
let number = /^[+-]?(\d+\.?\d*|\d*\.?\d+)([eE][+-]?\d+)?$/;

// Tests:
for (let str of ["1", "-1", "+15", "1.55", ".5", "5.",
                 "1.3e2", "1E-4", "1e+12"]) {
  if (!number.test(str)) {
    console.log(`Failed to match '${str}'`);
  }
}
for (let str of ["1a", "+-1", "1.2.3", "1+1", "1e4.5",
                 ".5.", "1f5", "."]) {
  if (number.test(str)) {
    console.log(`Incorrectly accepted '${str}'`);
  }
}

课本答案(-号最好转义?):

let number = /^[+\-]?(\d+(\.\d*)?|\.\d+)([eE][+\-]?\d+)?$/;

 

posted @ 2018-09-02 12:32  xkfx  阅读(455)  评论(0编辑  收藏  举报