Shell编程之文本处理

cut 截取指定列

可以按照某个字符进行分割,然后取出其中的指定列:

[root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt
140.205.201.30 - - [02/Dec/2017:00:15:24 +0800] "GET / HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:17:51 +0800] "GET /rs-status HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:06 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "GET /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:07 +0800] "POST /phpmyadmin/ HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:19:09 +0800] "GET /ganglia/index.php HTTP/1.1" 404 -
164.132.91.1 - - [02/Dec/2017:00:22:21 +0800] "GET / HTTP/1.1" 404 -
114.215.45.101 - - [02/Dec/2017:00:23:43 +0800] "GET / HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:32:41 +0800] "GET /index.php HTTP/1.1" 404 -
140.205.201.30 - - [02/Dec/2017:00:39:08 +0800] "GET /jobs/ HTTP/1.1" 404 -
[root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt |cut -d ' ' -f 6
"GET
"GET
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"POST
"GET
"GET
"GET
"GET
"GET

可以指定更多的列:

[root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt |cut -d ' ' -f 2,3,4
- - [02/Dec/2017:00:15:24
- - [02/Dec/2017:00:17:51
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:06
- - [02/Dec/2017:00:19:07
- - [02/Dec/2017:00:19:07
- - [02/Dec/2017:00:19:07
- - [02/Dec/2017:00:19:07
- - [02/Dec/2017:00:19:07
- - [02/Dec/2017:00:19:09
- - [02/Dec/2017:00:22:21
- - [02/Dec/2017:00:23:43
- - [02/Dec/2017:00:32:41
- - [02/Dec/2017:00:39:08
[root@iz8vbbqbnh4ug2q9so5jflz logs]# cat  localhost_access_log.2017-12-02.txt |cut -d ' ' -f 2,3,6-
- - "GET / HTTP/1.1" 404 -
- - "GET /rs-status HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /phpmyadmin/ HTTP/1.1" 404 -
- - "POST /phpmyadmin/ HTTP/1.1" 404 -
- - "GET /ganglia/index.php HTTP/1.1" 404 -
- - "GET / HTTP/1.1" 404 -
- - "GET / HTTP/1.1" 404 -
- - "GET /index.php HTTP/1.1" 404 -
- - "GET /jobs/ HTTP/1.1" 404 -

 sort 对列进行排序

例如,对tomcat访问日志按请求响应的返回大小(第10列)进行排序:

cat localhost_access_log.2017-12-01.txt |sort -t ' ' -k 10

 

-t : 指定分隔符

-k : 指定排序的列

114.241.108.197 - - [01/Dec/2017:09:03:45 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
114.241.108.197 - - [01/Dec/2017:11:45:30 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
114.241.108.197 - - [01/Dec/2017:14:41:04 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
223.72.82.98 - - [01/Dec/2017:15:26:10 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
59.108.217.106 - - [01/Dec/2017:09:35:17 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
59.108.217.106 - - [01/Dec/2017:13:08:46 +0800] "GET /js/plugin/jquery-file-upload/js/vendor/tmpl.min.js HTTP/1.1" 200 977
114.241.108.197 - - [01/Dec/2017:09:03:32 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
114.241.108.197 - - [01/Dec/2017:11:28:29 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
114.241.108.197 - - [01/Dec/2017:14:40:51 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
223.72.82.98 - - [01/Dec/2017:15:26:03 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
59.108.217.106 - - [01/Dec/2017:09:35:01 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
59.108.217.106 - - [01/Dec/2017:09:35:10 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
59.108.217.106 - - [01/Dec/2017:13:08:52 +0800] "GET /img/logo-pale.png HTTP/1.1" 200 9775
114.241.108.197 - - [01/Dec/2017:12:00:15 +0800] "GET /interview/detail.do?manageKey=15ba76c6fbeeccd2f8df875379ac88e9&targetPanel=dialog HTTP/1.1" 200 9952
59.108.217.106 - - [01/Dec/2017:16:44:53 +0800] "GET /interview/detail.do?manageKey=15ba76c6fbeeccd2f8df875379ac88e9&targetPanel=dialog HTTP/1.1" 200 9952
59.108.217.106 - - [01/Dec/2017:16:44:57 +0800] "GET /interview/detail.do?manageKey=15ba76c6fbeeccd2f8df875379ac88e9&targetPanel=dialog HTTP/1.1" 200 9952

 

排序是有方向的,默认是升序排序,如果要降序排列,可以在列号后面增加一个r:

cat localhost_access_log.2017-12-01.txt |sort -t ' ' -k 10r

 

最后要注意的是,这里的排序默认是按字符串的字典顺序排列的,如果要按其数值排序,则需要增加一个n:

 cat localhost_access_log.2017-12-01.txt |sort -t ' ' -k 10n

 

114.241.108.197 - - [01/Dec/2017:09:03:28 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
114.241.108.197 - - [01/Dec/2017:11:28:29 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
114.241.108.197 - - [01/Dec/2017:14:40:49 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
223.72.82.98 - - [01/Dec/2017:15:25:59 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
59.108.217.106 - - [01/Dec/2017:09:34:56 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
59.108.217.106 - - [01/Dec/2017:09:35:06 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
59.108.217.106 - - [01/Dec/2017:13:08:43 +0800] "GET /css/smartadmin-production.css HTTP/1.1" 200 394554
112.65.193.14 - - [01/Dec/2017:11:28:44 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
114.241.108.197 - - [01/Dec/2017:09:03:30 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
114.241.108.197 - - [01/Dec/2017:11:28:33 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
114.241.108.197 - - [01/Dec/2017:14:40:49 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
223.72.82.98 - - [01/Dec/2017:15:26:01 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
59.108.217.106 - - [01/Dec/2017:09:34:56 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
59.108.217.106 - - [01/Dec/2017:09:35:06 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844
59.108.217.106 - - [01/Dec/2017:13:08:43 +0800] "GET /js/jqueryui/1.10.3/jquery-ui.min.js HTTP/1.1" 200 435844

 

 由此可见,此网站最大的静态资源是这个jquery-ui.min.js文件。

uniq去重

 cat localhost_access_log.2017-12-01.txt |cut -d ' ' -f 1,10 |sort -t ' ' -k 2n,1|uniq

 

223.72.82.98 61692
59.108.217.106 61692
114.241.108.197 95786
223.72.82.98 95786
59.108.217.106 95786
114.241.108.197 116060
223.72.82.98 116060
59.108.217.106 116060
112.65.193.14 284394
114.241.108.197 284394
223.72.82.98 284394
59.108.217.106 284394
114.241.108.197 394554
223.72.82.98 394554
59.108.217.106 394554
112.65.193.14 435844
114.241.108.197 435844
223.72.82.98 435844
59.108.217.106 435844

 

 

wc统计

[root@iZ25klm6k7uZ logs]# wc -l localhost_access_log.2017-12-01.txt  统计行数
1967 localhost_access_log.2017-12-01.txt
[root@iZ25klm6k7uZ logs]# wc -w localhost_access_log.2017-12-01.txt  统计词数
19670 localhost_access_log.2017-12-01.txt
[root@iZ25klm6k7uZ logs]# wc -m localhost_access_log.2017-12-01.txt  统计字符数
219011 localhost_access_log.2017-12-01.txt
[root@iZ25klm6k7uZ logs]# 

 

sed正则查找

用sed来查找500的日志信息:

[root@iZ25klm6k7uZ logs]# sed -n '/\b500\b/p' localhost_access_log.2017-12-01.txt
119.127.17.97 - - [01/Dec/2017:14:23:18 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:23:24 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:24:12 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:31:11 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:49:51 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:49:57 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:55:45 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:58:03 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
59.108.217.106 - - [01/Dec/2017:15:00:22 +0800] "POST /interview/add.do HTTP/1.1" 500 19582

 

注意:-n选项和p命令配合使用,表示只打印匹配的行。

 

awk正则匹配

用awk来查找500日志信息:

awk '($9 ~ /500/)' localhost_access_log.2017-12-01.txt 

 

输出和上面的sed一样。

awk有默认的分隔符,比如\t、空格等。如果要指定分隔符可以用-F。

awk的强大之处在于它支持编程,格式如下:

awk pattern { action } 例如上面的查找500日志可以完整表达如下:

[root@iZ25klm6k7uZ logs]# awk -F ' ' '($9 ~ /500/){print }' localhost_access_log.2017-12-01.txt 
119.127.17.97 - - [01/Dec/2017:14:23:18 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:23:24 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:24:12 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:31:11 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:49:51 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:49:57 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:55:45 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
119.127.17.97 - - [01/Dec/2017:14:58:03 +0800] "POST /interview/add.do HTTP/1.1" 500 19582
59.108.217.106 - - [01/Dec/2017:15:00:22 +0800] "POST /interview/add.do HTTP/1.1" 500 19582

 

同时查找500和404的日志:

awk -F ' ' '($9 ~ /500/ || $9 ~ /404/){print $1,$6,$7,$9}' localhost_access_log.2017-12-01.txt

 

或者用正则的“|”一次匹配多个状态码(此例还额外匹配了400):

awk -F ' ' '($9 ~ /500|404|400/){print $1,"-",$4,"-",$6,"-",$9}' localhost_access_log.2017-12-01.txt

 

posted @ 2017-12-05 00:57  at0x7c00  阅读(375)  评论(0)  编辑  收藏  举报
CSDN - ITeye - GitHub  |  HuQiao's Blog  |  业余经营:IT快报