搜狗语料库数据整编
#!/bin/bash
#
# Convert every regular file in an input directory from GBK to UTF-8 and
# flatten each 6-line record onto one pipe-delimited line.
#
# Arguments:
#   $1 - input directory (must exist); each regular file directly inside it
#        is converted
#   $2 - output directory (must exist); receives one file per input file,
#        under the same file name
#
# Output: progress messages and the name of each processed file on stdout.
# Exit status: 1 when an argument is missing or is not a directory.
#
# NOTE(review): the 6-line grouping presumably matches Sogou corpus records
# of the form <doc>, <url>, <docno>, <contenttitle>, <content>, </doc> --
# confirm against the actual input data.

# Validate the input directory. Expansions are quoted so paths containing
# whitespace or glob characters are tested as a single word.
if [[ -z "$1" ]]; then
  echo "请指定输入目录"
  exit 1
elif [[ ! -d "$1" ]]; then
  echo "[$1]不是一个合法的输入路径"
  exit 1
fi

# Validate the output directory the same way.
if [[ -z "$2" ]]; then
  echo "请指定输出目录"
  exit 1
elif [[ ! -d "$2" ]]; then
  echo "[$2]不是一个合法的输出目录"
  exit 1
fi

echo "处理开始"

# Iterate with a glob rather than parsing `ls`, so names containing spaces
# or glob characters stay intact. nullglob makes an empty directory yield
# zero iterations instead of the literal pattern.
shopt -s nullglob
for path in "$1"/*; do
  filename=${path##*/}
  if [[ ! -f "$1/$filename" ]]; then
    echo "[$filename]不是一个合法的文件!将略过"
    continue
  fi
  printf '%s\n' "$filename"
  # iconv -c drops characters that cannot be converted instead of aborting.
  # The awk program strips the record tags and picks the output separator
  # from the line's position within each group of six:
  #   lines 1 and 5 -> no separator, lines 2-4 -> "|", line 6 -> newline.
  # Output is appended (>>), so re-running into the same output directory
  # adds to existing files rather than replacing them.
  iconv -f gbk -t utf-8 -c "$1/$filename" \
    | awk '{if(NR%6==1||NR%6==5){ORS=""}else{ORS=NR%6?"|":"\n"};gsub("</?doc>|</?url>|</?docno>|</?contenttitle>|</?content>","",$0);print}' >> "$2/$filename"
done

echo "处理结束"