Eloquent JavaScript #09# Regular Expressions
Notes
js创建正则表达式的两种等价写法:
// A pattern can be built with the RegExp constructor or written as a literal;
// both declarations below describe the same expression.
const re1 = new RegExp("abc");
const re2 = /abc/;

// test() reports whether the pattern occurs anywhere in the string.
console.log(/abc/.test("abcde"));
// → true
console.log(/abc/.test("abxde"));
// → false
\d |
Any digit character |
\w |
An alphanumeric character (“word character”) |
\s |
Any whitespace character (space, tab, newline, and similar) |
\D |
A character that is not a digit |
\W |
A nonalphanumeric character |
\S |
A nonwhitespace character |
. |
Any character except for newline |
/abc/ |
A sequence of characters |
/[abc]/ |
Any character from a set of characters |
/[^abc]/ |
Any character not in a set of characters |
/[0-9]/ |
Any character in a range of characters |
/x+/ |
One or more occurrences of the pattern x |
/x+?/ |
One or more occurrences, nongreedy |
/x*/ |
Zero or more occurrences |
/x?/ |
Zero or one occurrence |
/x{2,4}/ |
Two to four occurrences |
/(abc)/ |
A group |
/a|b|c/ |
Any one of several patterns |
/\d/ |
Any digit character |
/\w/ |
An alphanumeric character (“word character”) |
/\s/ |
Any whitespace character |
/./ |
Any character except newlines |
/\b/ |
A word boundary |
/^/ |
Start of input |
/$/ |
End of input |
\d等转义字符可以放在 [ ] 里而不丧失含义,但是 . 和 + 之类的特殊符号不行,它们在 [ ] 里会变为普通的符号。
整体取反,非0非1:
// [^01] matches any single character that is neither "0" nor "1".
const notBinary = /[^01]/;
console.log(notBinary.test("1100100010100110"));
// → false
console.log(notBinary.test("1100100010200110"));
// → true
+ one or more,* zero or more
// \d+ needs at least one digit between the quotes; \d* also accepts none.
console.log(/'\d+'/.test("'123'"));
// → true
console.log(/'\d+'/.test("''"));
// → false
console.log(/'\d*'/.test("'123'"));
// → true
console.log(/'\d*'/.test("''"));
// → true
? zero or one
// The "u" is optional, so both spellings are accepted.
const neighbor = /neighbou?r/;
console.log(neighbor.test("neighbour"));
// → true
console.log(neighbor.test("neighbor"));
// → true
{2} means a pattern should occur exactly that number of times. It is also possible to specify a range this way: {2,4}
means the element must occur at least twice and at most four times.
// One or two digits for month, day and hour; exactly four for the year
// and exactly two for the minutes.
const dateTime = /\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{2}/;
const sampleStamp = "1-30-2003 8:45";
console.log(dateTime.test(sampleStamp));
// → true
You can also specify open-ended ranges when using braces by omitting the number after the comma. So, {5,}
means five or more times.
括号内的n个元素被视作一个整体元素(分组,子表达式):
// The parenthesized group (hoo+) is repeated as a unit by the trailing "+";
// the i flag makes the whole pattern case-insensitive.
const cartoonCrying = /boo+(hoo+)+/i;
const sob = "Boohoooohoohooo";
console.log(cartoonCrying.test(sob));
// → true
i表示该表达式大小写不敏感。
可以让我们获取额外的信息:
// exec returns an array-like match object; its index property holds the
// position in the string where the match starts.
const match = /\d+/.exec("one two 100");
console.log(match);
// → ["100"]
console.log(match.index);
// → 8
exec的返回值:匹配失败为null,成功则如上所示。
等价写法:
// With a non-global pattern, String.prototype.match yields the same result as exec.
const firstNumber = "one two 100".match(/\d+/);
console.log(firstNumber);
// → ["100"]
含括号表达式的情况:
// Element 0 of the result is the whole match; later elements are the groups.
const quotedText = /'([^']*)'/;
console.log(quotedText.exec("she said 'hello'"));
// → ["'hello'", "hello"]

// A group that did not take part in the match is reported as undefined.
console.log(/bad(ly)?/.exec("bad"));
// → ["bad", undefined]

// A group matched several times keeps only its last match.
console.log(/(\d)+/.exec("123"));
// → ["123", "3"]
返回数组的第一个元素为整个正则表达式匹配的字符串,而第二元素为() 内正则(子表达式)匹配的字符串(没有就是undefined,多个就取最后一个)。容易知道,第二个元素几乎总是第一个元素的子集。
// NOTE: the sample output below was recorded in a GMT+0800 environment;
// the printed form and the timestamps depend on the local time zone and clock.
console.log(new Date());
// → Sat Sep 01 2018 13:54:43 GMT+0800 (中国标准时间)

// Month numbers are zero-based (11 is December); day numbers start at 1.
console.log(new Date(2009, 11, 9));
// → Wed Dec 09 2009 00:00:00 GMT+0800 (中国标准时间)
console.log(new Date(2009, 11, 9, 12, 59, 59, 999));
// → Wed Dec 09 2009 12:59:59 GMT+0800 (中国标准时间)

// getTime() gives milliseconds since the start of 1970 (UTC).
console.log(new Date(1997, 10, 19).getTime());
// → 879868800000
console.log(new Date(1387407600000));
// → Thu Dec 19 2013 07:00:00 GMT+0800 (中国标准时间)

console.log(new Date().getTime());
// → 1535781283593
console.log(Date.now());
// → 1535781283593
通过正则表达式,由String创建日期:
"use strict";

/**
 * Builds a Date from the first M-D-YYYY date found in a string.
 *
 * @param {string} string - Text containing a date such as "1-30-2003".
 * @returns {Date} Date for the extracted year, month and day.
 * @throws {TypeError} If the text contains no M-D-YYYY date.
 */
function getDate(string) {
  const found = /(\d{1,2})-(\d{1,2})-(\d{4})/.exec(string);
  if (found === null) {
    // exec returns null on failure; destructuring it directly would raise an
    // opaque "null is not iterable" error, so report the real problem instead.
    throw new TypeError(`No M-D-YYYY date found in ${JSON.stringify(string)}`);
  }
  // "_" only occupies the slot of the full match, which is not needed.
  const [_, month, day, year] = found;
  // Date months are zero-based, hence the -1.
  return new Date(Number(year), Number(month) - 1, Number(day));
}
console.log(getDate("1-30-2003"));
// → Thu Jan 30 2003 00:00:00 GMT+0100 (CET)
PS. 下划线除了用来占位外没有其它含义。
利用 ^ 和 $ 。例如/^\d+$/匹配完全由数字构成的字符串,/^!/
匹配由!开头的字符串,而/x^/
啥也匹配不了。
用 \b 标注单词边界:
// \b only matches where a word character meets a non-word character
// (or the start/end of the string).
console.log(/cat/.test("concatenate"));
// → true
console.log(/\bcat\b/.test("concatenate"));
// → false
console.log(/\bcat\b/.test("xx cat xx"));
// → true

// A number, a space, one of three animal names and an optional plural "s",
// all delimited by word boundaries.
const animalCount = /\b\d+ (pig|cow|chicken)s?\b/;
console.log(animalCount.test("15 pigs"));
// → true
console.log(animalCount.test("15 pigchickens"));
// → false
当你进行正则匹配时(test或者exec),正则引擎将从所给字符串的开头开始尝试匹配,接着是第二个字符,第三个字符... 试图在所给字符串中寻找一个匹配,直到找到一个匹配项或者到达字符串末尾结束。要么返回第一个匹配,要么什么都匹配不到。
/**
 * Simulates, step by step, how a regex engine matches
 * /\b\d+ (pig|cow|chicken)s?\b/ against "the 3 pigs".
 */
const str = "the 3 pigs";

/**
 * Hand-written matcher equivalent to /\b\d+ (pig|cow|chicken)s?\b/.
 *
 * @param {string} str - Text to search.
 * @param {number} [start=0] - Position at which to begin trying matches.
 * @returns {string[] | null} One-element array holding the matched text,
 *   or null when no position yields a match.
 */
function simulateRegex(str, start = 0) {
  const animals = ["pig", "cow", "chicken"];
  const isDigit = (ch) => ch !== undefined && ch >= "0" && ch <= "9";
  const isWordChar = (ch) =>
    ch !== undefined &&
    ((ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z") || isDigit(ch) || ch === "_");
  // \b holds where exactly one of the two neighbouring characters is a word
  // character; positions outside the string count as non-word.
  const isBoundary = (pos) => isWordChar(str[pos - 1]) !== isWordChar(str[pos]);

  // Try every starting position until one matches or the string is exhausted.
  for (let currentPosition = start; currentPosition < str.length; ++currentPosition) {
    let tempPosition = currentPosition;

    // \b — leading word boundary
    if (!isBoundary(tempPosition)) continue;

    // \d+ — at least one digit, then as many more as possible
    if (!isDigit(str[tempPosition])) continue;
    while (isDigit(str[tempPosition])) tempPosition++;

    // a single space
    if (str[tempPosition] !== " ") continue;
    tempPosition++;

    // (pig|cow|chicken) — alternatives are tried in order
    const animal = animals.find((word) => str.startsWith(word, tempPosition));
    if (animal === undefined) continue;
    tempPosition += animal.length;

    // s?\b — greedily take the "s" if a boundary follows it ...
    if (str[tempPosition] === "s" && isBoundary(tempPosition + 1)) {
      return [str.slice(currentPosition, tempPosition + 1)];
    }
    // ... otherwise backtrack and require the boundary without the "s".
    if (isBoundary(tempPosition)) {
      return [str.slice(currentPosition, tempPosition)];
    }
  }
  return null;
}

let match = simulateRegex(str, 4);
console.log(match);
// → ["3 pigs"]
正则引擎在进行分支匹配(|)或重复匹配(+ *)时,如果发现无法继续往下匹配,就会进行“回溯”。
在进行分支匹配时,如果第一个分支就匹配成功,就不再匹配其它分支,如果不成功就会回溯到分支的入口,进入到另外一个分支继续匹配。
而进行重复匹配时,例如用/^.*x/匹配"abcxe",.*会首先把所有字符消费干净,当正则引擎发现最后还需要一个x时,*操作符会尝试少匹配一个字符,但是仍然没发现x,于是继续回溯,直到.*只匹配abc、后面紧跟着x,最终匹配到字符串abcx。
replace配合正则:
// With a string as first argument only the first occurrence is replaced.
console.log("papa".replace("p", "m"));
// → mapa

// Without the g flag a regular expression also replaces only the first match.
console.log("Borobudur".replace(/[ou]/, "a"));
// → Barobudur

// g stands for global: every match is replaced.
console.log("Borobudur".replace(/[ou]/g, "a"));
// → Barabadar
replace的真正强大之处在于可以用“$数字”引用匹配字符串:
// "$1" and "$2" in the replacement refer to the first and second group.
console.log(
  "Liskov, Barbara\nMcCarthy, John\nWadler, Philip"
    .replace(/(\w+), (\w+)/g, "$2 $1"));
// → Barbara Liskov
//   John McCarthy
//   Philip Wadler

// "$" followed by a number refers to a group of the match. Each comma is
// replaced by a space, which joins the space that already followed it.
"hello, word, every, one".replace(/(\w+),/g, "$1 ");
// → "hello  word  every  one"

// "$&" refers to the whole match.
"hello, word, every, one".replace(/one/g, "$& $&");
// → "hello, word, every, one one"
还可以传入函数:
// When the replacement is a function, it is called for every match and its
// return value is inserted in place of that match.
"hello, word, every, one".replace(/(\w+),/g, (matched) => matched.toUpperCase());
// → "HELLO, WORD, EVERY, one"
let stock = "1 lemon, 2 cabbages, and 101 eggs";

/**
 * Replacement callback that lowers a "<amount> <unit>" phrase by one.
 *
 * @param {string} match - The whole matched phrase (unused).
 * @param {string} amount - Text of the first group, the quantity.
 * @param {string} unit - Text of the second group, the item name.
 * @returns {string} The phrase with the quantity reduced by one; a quantity
 *   of zero is written as "no", and the unit loses its last character when
 *   exactly one is left.
 */
function minusOne(match, amount, unit) {
  const remaining = Number(amount) - 1;
  if (remaining === 1) {
    // only one left, remove the 's'
    return `${remaining} ${unit.slice(0, -1)}`;
  }
  if (remaining === 0) {
    return `no ${unit}`;
  }
  return `${remaining} ${unit}`;
}

console.log(stock.replace(/(\d+) (\w+)/g, minusOne));
// → no lemon, 1 cabbage, and 100 eggs
// Removes line comments and block comments from a piece of code.
// [^] matches any character, newlines included, so a block comment may span
// several lines (the period cannot match a newline).
// NOTE: [^]* is greedy on purpose here. It is the version whose flaw is
// analysed below: the last call swallows everything between the first
// opening and the last closing comment marker.
function stripComments(code) {
  const commentPattern = /\/\/.*|\/\*[^]*\*\//g;
  return code.replace(commentPattern, "");
}

console.log(stripComments("1 + /* 2 */3"));
// → 1 + 3
console.log(stripComments("x = 10;// ten!"));
// → x = 10;
console.log(stripComments("1 /* a */+/* b */ 1"));
// → 1  1
可以用replace来去掉代码中的所有注释。
[^]可以匹配任何字符,因为 /**/可能跨多行,句点 . 无法匹配换行符号。
然而上面最后一行代码结果却出错了,这是为什么呢?
因为(+
, *
, ?
, and {}
) 这些操作符号都是贪婪的,就像“回溯”里面提到的,它们总是先尽可能地消费字符,直到无路可走才会回头,这样理所当然会匹配到更长的那一个。解决方案就是在这些符号后面加问号 (+?
, *?
, ??
, {}?
),这样它们就会匹配尽可能少的字符串。
// Removes line comments and block comments from a piece of code.
// The non-greedy [^]*? stops at the first closing marker, so each block
// comment is removed on its own.
function stripComments(code) {
  const commentPattern = /\/\/.*|\/\*[^]*?\*\//g;
  const withoutComments = code.replace(commentPattern, "");
  return withoutComments;
}

console.log(stripComments("1 /* a */+/* b */ 1"));
// → 1 + 1
当要用到重复匹配符时,先考虑用非贪婪版本的。
利用new RegExp(拼接字符串, "gi")构建,gi表示global替换全部和大小写不敏感。
// Build a pattern around a name that is only known at run time.
// Backslashes are doubled because the pattern is written as a normal string.
const name = "harry";
const text = "Harry is a suspicious character.";
const regexp = new RegExp("\\b(" + name + ")\\b", "gi");
console.log(text.replace(regexp, "_$1_"));
// → _Harry_ is a suspicious character.

// A name that contains special characters must be escaped first.
// (Distinct variable names keep both examples runnable in one scope.)
const trickyName = "dea+hl[]rd";
const trickyText = "This dea+hl[]rd guy is super annoying.";
// Put a backslash in front of every character that has a special meaning.
const escaped = trickyName.replace(/[\\[.+*?(){|^$]/g, "\\$&");
// escaped → "dea\+hl\[]rd"
const trickyRegexp = new RegExp("\\b" + escaped + "\\b", "gi");
console.log(trickyText.replace(trickyRegexp, "_$&_"));
// → This _dea+hl[]rd_ guy is super annoying.
正则版indexOf(search方法):
// search() returns the index of the first match, or -1 when there is none.
// The first sample starts with two spaces, so the first non-whitespace
// character sits at index 2.
console.log("  word".search(/\S/));
// → 2
console.log("    ".search(/\S/));
// → -1
需求:设置从字符串的某个字符开始匹配
问题:没有方便的办法
理由:不方便正是js的特性。。。。
解决方案:在【严格的条件】下用lastIndex设定起始位置
严格的条件:表达式必须开启g(global)或者y(sticky)选项,并且必须通过exec方式执行匹配。
lastIndex:正则对象的一个属性,数字,决定了下一个匹配从第几个字符开始。在严格条件 ↑ 下设定才有效。非严格条件下改变该值是毫无作用的。
// With the g flag, exec starts searching at lastIndex and, after a
// successful match, moves lastIndex to just past that match.
const pattern = /y/g;
pattern.lastIndex = 3;
const match = pattern.exec("xyzzy");
console.log(match.index);
// → 4
console.log(pattern.lastIndex);
// → 5
仅global:匹配成功,自动更新lastIndex为匹配成功位置的下一个位置(如上),匹配失败,lastIndex重新设置为0。
global:从str[lastIndex]开始向后搜索匹配
sticky:从str[lastIndex]直接开始匹配,不向后搜索。
// global: searches forward from lastIndex (0 here) until a match is found.
const global = /abc/g;
console.log(global.exec("xyz abc"));
// → ["abc"]

// sticky: the match must begin exactly at lastIndex (0 here), no searching.
const sticky = /abc/y;
console.log(sticky.exec("xyz abc"));
// → null
所以只需简单调整一下lastIndex就可以让上面成功的失败、失败的成功:
const global = /abc/g;
global.lastIndex = 6; // start searching forward from the "c"
console.log(global.exec("xyz abc"));
// → null

const sticky = /abc/y;
sticky.lastIndex = 4; // start matching exactly at the "a"
console.log(sticky.exec("xyz abc"));
// → ["abc"]
因为在global启用时,lastIndex在匹配完之后是要自动更新的,所以,当用同一个正则对象匹配多次的时候就会出现坑爹的结果:
// The first call leaves lastIndex just past its match; the second call on a
// different string therefore starts searching beyond that string's only digit.
const digit = /\d/g;
console.log(digit.exec("here it is: 1"));
// → ["1"]
console.log(digit.exec("and now: 1"));
// → null
只有在既不启用g也不启用y时才不会有这方面的顾虑(sticky模式下exec同样会自动更新lastIndex)。
global的另外一方面影响在于,它改变了match的行为:
console.log("Banana".match(/an/g));
// → ["an", "an"]
console.log(/an/g.exec("Banana"));
// → ["an", index: 1, input: "Banana", groups: undefined]

// The g flag changes what match() does: the two calls above would otherwise
// be equivalent and print the same thing. In a normal (non-global) result
// such as ["an", "an"], the second element would be the text matched by a
// group, i.e. a part of the first element, not a second match.
总结。。慎用global
利用global模式下的lastIndex机制应该是最简便的方法。
// Walk over every match by calling exec repeatedly on the same global
// pattern: each successful call advances lastIndex, and the call that finds
// nothing returns null, which ends the loop.
const input = "A string with 3 numbers in it... 42 and 88.";
const number = /\b\d+\b/g;
let match;
while ((match = number.exec(input)) !== null) {
  console.log("Found", match[0], "at", match.index);
}
// → Found 3 at 14
//   Found 42 at 33
//   Found 88 at 40
/**
 * Parses INI-formatted text into a plain object.
 *
 * Lines of the form `name=value` become properties of the current section,
 * `[name]` lines open a new section object on the result, and blank lines or
 * lines starting with a semicolon are skipped.
 *
 * @param {string} string - The INI text; lines may end in \n or \r\n.
 * @returns {Object} Top-level settings plus one nested object per section.
 * @throws {Error} If a line is neither a setting, a section header, a
 *   comment nor blank.
 */
function parseINI(string) {
  // Start with an object to hold the top-level fields
  const result = {};
  let section = result;
  for (const line of string.split(/\r?\n/)) {
    const setting = /^(\w+)=(.*)$/.exec(line);
    if (setting !== null) {
      section[setting[1]] = setting[2];
      continue;
    }
    const header = /^\[(.*)\]$/.exec(line);
    if (header !== null) {
      // Later settings go into this freshly created section.
      section = result[header[1]] = {};
      continue;
    }
    if (!/^\s*(;.*)?$/.test(line)) {
      throw new Error("Line '" + line + "' is not valid.");
    }
  }
  return result;
}

console.log(parseINI(`
searchengine=https://duckduckgo.com/?q=$1
spitefulness=9.7

; comments are preceded by a semicolon...
; each section concerns an individual enemy
[larry]
fullname=Larry Doe
type=kindergarten bully
website=http://www.geocities.com/CapeCanaveral/11451

[davaeorn]
fullname=Davaeorn
type=evil wizard
outputdir=/home/marijn/enemies/davaeorn`));
// → davaeorn: { fullname: "Davaeorn", type: "evil wizard", outputdir: "/home/marijn/enemies/davaeorn" }
//   larry: { fullname: "Larry Doe", type: "kindergarten bully", website: "http://www.geocities.com/CapeCanaveral/11451" }
//   searchengine: "https://duckduckgo.com/?q=$1"
//   spitefulness: "9.7"
// Without the u flag the quantifier applies only to the second code unit of
// the emoji, and "." matches a single code unit.
console.log(/🍎{3}/.test("🍎🍎🍎"));
// → false
console.log(/<.>/.test("<🌹>"));
// → false
// With u, the emoji is treated as one character.
console.log(/<.>/u.test("<🌹>"));
// → true
🍎在JavaScript字符串里由两个码元(code unit)构成,🍎{3} 后面的量词实际针对的是构成🍎的第二个码元,解决方法是在正则后添加u(for Unicode)。这一行为不是默认的,因为改成默认可能导致已有代码的匹配出现问题,所以需要手动开启。
另外,在添加u的前提下,还可以用\p{Property=Value}按Unicode属性匹配字符:
// \p{...} matches characters by Unicode property; it requires the u flag.
console.log(/\p{Script=Greek}/u.test("α"));
// → true
console.log(/\p{Script=Arabic}/u.test("α"));
// → false
console.log(/\p{Alphabetic}/u.test("α"));
// → true
console.log(/\p{Alphabetic}/u.test("!"));
// → false
Exercises
// Fill in the regular expressions

verify(/ca[rt]/,
       ["my car", "bad cats"],
       ["camper", "high art"]);

verify(/pr?op/,
       ["pop culture", "mad props"],
       ["plop", "prrrop"]);

verify(/ferr(et|y|ari)/,
       ["ferret", "ferry", "ferrari"],
       ["ferrum", "transfer A"]);

verify(/ious\b/,
       ["how delicious", "spacious room"],
       ["ruinous", "consciousness"]);

verify(/\s[.,:;]/,
       ["bad punctuation ."],
       ["escape the period"]);

verify(/\w{7}/,
       ["hottentottententen"],
       ["no", "hotten totten tenten"]);

verify(/\b[^\We]+\b/i,
       ["red platypus", "wobbling nest"],
       ["earth bed", "learning ape", "BEET"]);

/**
 * Checks a pattern against sample strings and logs every mismatch.
 *
 * @param {RegExp} regexp - The pattern under test.
 * @param {string[]} yes - Strings the pattern is expected to match.
 * @param {string[]} no - Strings the pattern is expected not to match.
 */
function verify(regexp, yes, no) {
  // Ignore unfinished exercises
  if (regexp.source === "...") return;
  for (const str of yes) {
    if (!regexp.test(str)) {
      console.log(`Failure to match '${str}'`);
    }
  }
  for (const str of no) {
    if (regexp.test(str)) {
      console.log(`Unexpected match for '${str}'`);
    }
  }
}
-—————— -- -——-—— -- - -----————------------ -- -- -- - -- —
const text = "'I'm the cook,' he said, 'it's my job.'";
// Change this call.
// The second alternative consumes an apostrophe that sits between two word
// characters (a contraction) and puts it back unchanged; any other single
// quote is matched by the first alternative and becomes a double quote.
const requoted = text.replace(
  /'|([\w]'[\w])/g,
  (matched) => (matched === "'" ? '"' : matched)
);
console.log(requoted);
// → "I'm the cook," he said, "it's my job."
课本解答:
// Textbook answer: a quote counts as a quotation mark when a non-word
// character (or the start or end of the string) is next to it. The captured
// neighbours are written back through $1 and $2.
const text = "'I'm the cook,' he said, 'it's my job.'";
const edgeQuote = /(^|\W)'|'(\W|$)/g;
console.log(text.replace(edgeQuote, '$1"$2'));
// → "I'm the cook," he said, "it's my job."
-—————— -- -——-—— -- - -----————------------ -- -- -- - -- —
// Fill in this regular expression.
// Optional sign, then digits with an optional fraction (at least one digit
// on either side of the dot), then an optional exponent.
const number = /^[+-]?(\d+\.?\d*|\d*\.?\d+)([eE][+-]?\d+)?$/;

// Tests:
for (const str of ["1", "-1", "+15", "1.55", ".5", "5.",
                   "1.3e2", "1E-4", "1e+12"]) {
  if (!number.test(str)) {
    console.log(`Failed to match '${str}'`);
  }
}
for (const str of ["1a", "+-1", "1.2.3", "1+1", "1e4.5",
                   ".5.", "1f5", "."]) {
  if (number.test(str)) {
    console.log(`Incorrectly accepted '${str}'`);
  }
}
课本答案(-号最好转义?):
// Textbook answer: either digits with an optional fractional part, or a dot
// followed by digits; sign and exponent are optional.
const number = /^[+\-]?(\d+(\.\d*)?|\.\d+)([eE][+\-]?\d+)?$/;