1. 用awk的match匹配URL。注意:如果要匹配以"/"开头或以"/"结尾的字符串,需要把"/"写成"[/]";分号";"同样需要处理,改用其ASCII码转义"\\x3B"。
-- Classify each record by its referer page and report how the city field was
-- filled in.
--
-- Input  (relation i): time, city, user, referer.
-- Output (relation k), tab separated: page label, finding, time, user.
--   * "选择了多个地点" is emitted when the city field holds more than one
--     non-empty entry (entries are separated by a semicolon, written as \\x3B).
--   * "直接选择省" is emitted when at least one entry is a province name.
--
-- Notes on the awk script:
--   * The more specific results-page URL is tested before the generic channel
--     URL; the generic pattern also matches results-page URLs, so testing it
--     first would make the results-page branch unreachable.
--   * Province lookup uses index() (literal substring search) rather than
--     match(), so city text containing regex metacharacters cannot break the
--     script, and empty entries produced by a trailing separator are skipped.
--   * Parentheses are stripped from each entry before comparison.
--   * The trailing "省" is removed using length("省"), which keeps the logic
--     correct whether awk counts characters or bytes.
--   * The awk variable i is local to the script and unrelated to relation i.
j = foreach i generate time,city,user,referer;
k = stream j through `awk '
{
    refer=$4
    tt=""
    if(match(refer,"http://sou.zhaopin.com/jobs[/]"))
        tt="职位搜索结果页"
    else if(match(refer,"http://sou.zhaopin.com"))
        tt="职位搜索频道页"
    if(length(tt) > 0)
    {
        tmp="广东,湖北,陕西,四川,辽宁,吉林,江苏,山东,浙江,广西,安徽,河北,山西,内蒙,黑龙江,福建,江西,河南,湖南,海南,贵州,云南,西藏,甘肃,青海,宁夏,新疆,新疆维吾尔自治区,香港特别行政区,澳门特别行政区,台湾省"
        suffix="省"
        slen=length(suffix)
        n=split($2,arrcity,"\\x3B")
        cnt=0
        b="false"
        for(i=1;i<=n;i++)
        {
            city1=arrcity[i]
            gsub("[()]","",city1)
            if(length(city1) == 0)
                continue
            cnt++
            if(city1 != "台湾省" && substr(city1,length(city1)-slen+1) == suffix)
                city1=substr(city1,1,length(city1)-slen)
            if(length(city1) > 0 && index(tmp,city1) > 0)
                b="true"
        }
        if(cnt > 1)
            print tt"\t选择了多个地点\t"$1"\t"$3
        if(b == "true")
            print tt"\t直接选择省\t"$1"\t"$3
    }
}'`;
2. 使用awk匹配URL。注意:URL中的"?"是正则表达式的元字符,需要写成"[?]"才能按字面匹配。
-- Extract two steps of the resume-creation flow on my.zhaopin.com.
--
-- Input  (relation h3): time, site, path, referer, user. A null referer is
-- replaced by the literal NA before streaming.
-- Output (relation i), tab separated: time, page URL, referer URL (or NA), user.
--   * A hit on resume_nav.asp?nr=yes is emitted with referer NA.
--   * A hit on resume_baseinfo.asp is emitted only when its referer is
--     resume_nav.asp?nr=yes.
-- In the patterns, "/" and "?" are written as [/] and [?] so they are matched
-- literally.
b = foreach h3 generate time,site,path,((referer is null)?'NA':referer) as referer:chararray,user;
i = stream b through `awk '
{
    host=$2
    page=$3
    prev=$4
    if(host == "my.zhaopin.com")
    {
        if(match(page,"[/]myzhaopin/resume_nav.asp[?]nr=yes"))
            print $1"\thttp://my.zhaopin.com/myzhaopin/resume_nav.asp?nr=yes\tNA\t"$5
        if(match(page,"[/]myzhaopin/resume_baseinfo.asp") && match(prev,"http://my.zhaopin.com/myzhaopin/resume_nav.asp[?]nr=yes"))
            print $1"\thttp://my.zhaopin.com/myzhaopin/resume_baseinfo.asp\thttp://my.zhaopin.com/myzhaopin/resume_nav.asp?nr=yes\t"$5
    }
}'`;
3. 使用awk匹配城市,区分"单个城市"、"单个省份"和"多个地点"三种情况。
11.txt文件内容:
530;586;785;
浙江
台湾省;北京;上海
538
(赣州
深圳
上海;(湖北)
青海省
湖北省;陕西
-- Classify the city field of each line of 11.txt as a single province, a
-- single city, or a multi-location selection.
--
-- Input  (relation a): one chararray column, city; entries are separated by a
-- semicolon (written as \\x3B in the script).
-- Output (relation c), tab separated: label, original city field.
--   * Exactly one non-empty entry: "选择了单个省" when it is a province name,
--     "选择了单个城市" when it contains any character outside 0-9A-Za-z,
--     otherwise nothing is printed (e.g. purely numeric codes).
--   * Otherwise: "多选地点-包括城市和省" when more than one entry is recognised
--     as a province or a city.
--
-- Notes on the awk script:
--   * Empty entries produced by a trailing separator are ignored; they are
--     neither counted nor treated as a province.
--   * The trailing "省" is stripped based on the entry itself (the earlier
--     version measured an unrelated variable) and uses length("省"), so it
--     works whether awk counts characters or bytes.
--   * Province lookup uses index() (literal substring search) so entry text is
--     never interpreted as a regular expression.
--   * Parentheses are stripped from each entry before comparison.
--   * The awk variable i is local to the script.
a = load '11.txt' as (city:chararray);
b = foreach a generate city;
c = stream b through `awk '
{
    tmp="广东,湖北,陕西,四川,辽宁,吉林,江苏,山东,浙江,广西,安徽,河北,山西,内蒙,黑龙江,福建,江西,河南,湖南,海南,贵州,云南,西藏,甘肃,青海,宁夏,新疆,新疆维吾尔自治区,香港特别行政区,澳门特别行政区,台湾省"
    suffix="省"
    slen=length(suffix)
    n=split($1,arrcity,"\\x3B")
    cnt=0
    hits=0
    label=""
    for(i=1;i<=n;i++)
    {
        city1=arrcity[i]
        gsub("[()]","",city1)
        if(length(city1) == 0)
            continue
        cnt++
        if(city1 != "台湾省" && substr(city1,length(city1)-slen+1) == suffix)
            city1=substr(city1,1,length(city1)-slen)
        if(length(city1) > 0 && index(tmp,city1) > 0)
        {
            hits++
            label="选择了单个省"
        }
        else if(match(city1,"[^0-9A-Za-z]+"))
        {
            hits++
            label="选择了单个城市"
        }
    }
    if(cnt == 1)
    {
        if(hits == 1)
            print label"\t"$1
    }
    else if(hits > 1)
        print "多选地点-包括城市和省\t"$1
}'`;
dump c;