搜狗语料库数据整编

 1 #!/bin/bash
 2 if [ -z "$1" ] ; then
 3   echo "请指定输入目录"
 4   exit 1
 5 elif [ ! -d $1 ] ; then
 6   echo "[$1]不是一个合法的输入路径"
 7   exit 1
 8 fi
 9 if [ -z "$2" ] ; then
10   echo "请指定输出目录"
11   exit 1
12 elif [ ! -d $2 ] ; then
13   echo "[$2]不是一个合法的输出目录"
14   exit 1
15 fi
16 
17 echo "处理开始"
18 
19 for filename in `ls $1`
20   do
21     if [ ! -f "$1/$filename" ] ; then
22        echo "[$filename]不是一个合法的文件!将略过"
23        continue
24     fi
25     echo $filename
26     iconv -f gbk -t utf-8 -c "$1/$filename"| awk '{if(NR%6==1||NR%6==5){ORS=""}else{ORS=NR%6?"|":"\n"};gsub("</?doc>|</?url>|</?docno>|</?contenttitle>|</?content>","",$0);print}' >> "$2/$filename"
27   done
28 
29 echo "处理结束"

 

posted @ 2017-08-31 15:28  aj117  阅读(637)  评论(0)  编辑  收藏  举报