#!/bin/bash
# 搜索关键词统计 — search-keyword statistics.
#
# Fetches daily search-word logs from hosts 10.15.200.171 and 10.15.200.173,
# merges them, and produces a ranked keyword-frequency report.
#
# Log line format (comma-separated): field 2 = search keyword, field 6 = user id
# (presumably — TODO confirm against the log producer). Counting logic: dedupe
# (keyword,user) pairs first so each user contributes a keyword at most once.
#
# Output: $tmpSearchWordlog/statistic.result  ("count<TAB>keyword", descending)

sourceDir="/export/manager/kmsearch/log/wordlog"
tmpDateFile="/tmp/search_wordlog_tmp.txt"
tmpSearchWordlog="/tmp/search_wordlog"

startDate="2015-05-04"
endDate="2015-12-31"

# ---- Build the list of daily log file names (YYYY-MM-DD.txt) for the range ----
startTimeStamp=$(date -d "$startDate" +%s)
endTimeStamp=$(date -d "$endDate" +%s)
: > "$tmpDateFile"   # truncate; the original 'echo "" >' left a blank first line
for (( ts = startTimeStamp; ts <= endTimeStamp; ts += 86400 )); do
  date -d "@$ts" "+%Y-%m-%d.txt"
done >> "$tmpDateFile"

# Target directories must exist or every scp below fails.
mkdir -p "$tmpSearchWordlog/171" "$tmpSearchWordlog/173"

# ---- Download the daily logs from both hosts ----
# A missing day's file on a host is expected and non-fatal.
for host in 171 173; do
  echo "downloading from $host..."
  while IFS= read -r fname; do
    [ -n "$fname" ] || continue
    scp "root@10.15.200.$host:$sourceDir/$fname" "$tmpSearchWordlog/$host/" \
      || echo "warn: could not fetch $host/$fname" >&2
  done < "$tmpDateFile"
done

# ---- Concatenate both hosts' logs into one file ----
echo "combine all data... "
: > "$tmpSearchWordlog/alldata.txt"
while IFS= read -r fname; do
  [ -n "$fname" ] || continue
  for host in 171 173; do
    srcFile="$tmpSearchWordlog/$host/$fname"
    [ -f "$srcFile" ] && cat "$srcFile" >> "$tmpSearchWordlog/alldata.txt"
  done
done < "$tmpDateFile"

# (A single-pass whole-file top-100 variant existed here but was disabled;
# the chunked approach below replaces it.)

# ---- Split into 3,000,000-line chunks (<200M) and count each in parallel ----
cd "$tmpSearchWordlog" || exit 1
find . -name 'part.alldata.txt*' -exec rm -f -- {} +
split -l 3000000 alldata.txt part.alldata.txt
for part in part.alldata.txt*; do
  [ -e "$part" ] || continue
  # Per chunk: dedupe (keyword,user) pairs, count keyword occurrences,
  # keep the top 900 as "count<TAB>keyword" (keyword may span two words).
  awk -F ',' '{print $2","$6}' "$part" | sort | uniq \
    | awk -F ',' '{print $1}' | sort | uniq -c | sort -rn | head -900 \
    | awk '{print $1"\t"$2" "$3}' > "${part}_Tops.txt" &
done
echo 'waiting for per-chunk counters to finish...'
# Block until ALL background counters are done. The original 'sleep 60'
# raced against slow chunks and could merge partial results.
wait

# ---- Merge per-chunk results ----
# Lowercase everything, sum counts per keyword across chunks, rank descending,
# and drop URL-like entries from the final report.
find . -name 'part.alldata.txt*_Tops.txt' -exec cat -- {} + \
  | awk '{print $2"\t"$1}' | tr '[A-Z]' '[a-z]' | sort > partsAllTops.txt
awk '{a[$1]+=$2} END {for (i in a) print i, a[i]}' partsAllTops.txt \
  | awk '{print $2"\t"$1}' | sort -rn \
  | grep -v 'www.' \
  | grep -v 'http:' > statistic.result