Preprocessing code for Amazon review data, used for sentiment analysis. The code is adapted from
https://github.com/PaddlePaddle/Paddle/tree/develop/demo/quick_start/data
Amazon product review data:
http://jmcauley.ucsd.edu/data/amazon/
Bash script get_data.sh:
```bash
#!/bin/bash
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
# 3. distinct train set and test set.
set -e

# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
# wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
# wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz

echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
# wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
# unzip master.zip
# rm master.zip

##################
# Preprocess data
echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`

if [ ${UNAME_STR} == 'Linux' ]; then
  SHUF_PROG='shuf'
else
  SHUF_PROG='gshuf'
fi

mkdir -p tmp
# python preprocess.py -i reviews_Electronics_5.json.gz
python preprocess.py -i reviews_Digital_Music_5.json.gz

# uniq and shuffle
cd tmp
echo 'Uniq and shuffle...'
cat pos_* | sort | uniq | ${SHUF_PROG} > pos.shuffed
cat neg_* | sort | uniq | ${SHUF_PROG} > neg.shuffed

min_len=`sed -n '$=' neg.shuffed`
echo `sed -n '$=' neg.shuffed`
test_num=$((min_len / 10))
if [ $test_num -gt 12500 ]; then
  test_num=12500
fi
train_num=$((min_len - test_num))

head -n$train_num pos.shuffed > train.pos
head -n$train_num neg.shuffed > train.neg
tail -n$test_num pos.shuffed > test.pos
tail -n$test_num neg.shuffed > test.neg

cat train.pos train.neg | ${SHUF_PROG} > ../train.txt
cat test.pos test.neg | ${SHUF_PROG} > ../test.txt

cd -
echo 'train.txt' > train.list
echo 'test.txt' > test.list

# use 30k dict
# rm -rf tmp
mv dict.txt dict_all.txt
cat dict_all.txt | head -n 30001 > dict.txt

echo 'Done.'
```
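After get_data.sh finishes, train.txt and test.txt each hold one example per line in the form `label<TAB>review text` (1 = positive, 0 = negative), and dict.txt keeps the 30k most frequent words plus the `unk` entry. Below is a minimal sanity-check sketch in Python; the file names come from the script above, while the function names and the assumption that the files sit in the current directory are just for illustration:

```python
# Quick sanity check of the files produced by get_data.sh.
# Assumes train.txt, test.txt and dict.txt are in the current directory.
from collections import Counter


def load_examples(path):
    """Read 'label<TAB>text' lines into (label, text) pairs."""
    examples = []
    with open(path) as f:
        for line in f:
            label, _, text = line.rstrip('\n').partition('\t')
            examples.append((int(label), text))
    return examples


train = load_examples('train.txt')
test = load_examples('test.txt')

# The script builds a 1:1 pos/neg split, so the label counts should match.
print('train size:', len(train), Counter(label for label, _ in train))
print('test size:', len(test), Counter(label for label, _ in test))

with open('dict.txt') as f:
    vocab = [line.split('\t')[0] for line in f]
print('dict size (incl. unk):', len(vocab))
```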
Data preprocessing script preprocess.py:
```python
# -*- coding: UTF-8 -*-
"""
1. Tokenize the words and punctuation
Usage: python preprocess.py -i data_file [random seed]
"""
import sys
import os
import operator
import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing

batch_size = 5000
word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2)  # parse + tokenize + save
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)


def create_dict(data):
    """
    Create dictionary based on data, and saved in data_dir/dict.txt.
    The first line is unk \t -1.
        data: list, input data by batch.
    """
    for seq in data:
        try:
            for w in seq.lower().split():
                if w not in word_count:
                    word_count[w] = 1
                else:
                    word_count[w] += 1
        except:
            sys.stderr.write(seq + "\tERROR\n")


def parse(path):
    """
    Open .gz file.
    """
    sys.stderr.write(path)
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
    g.close()


def tokenize(sentences):
    """
    Use tokenizer.perl to tokenize input sentences.
    tokenizer.perl is tool of Moses.
        sentences : a list of input sentences.
        return: a list of processed text.
    """
    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    if not os.path.exists(dir):
        sys.exit(
            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
        )
    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    return toks


def save_data(instance, data_dir, pre_fix, batch_num):
    """
    save data by batch
    """
    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
    file(file_name, 'w').write('\n'.join(lines) + '\n')


def tokenize_batch(id):
    """
    tokenize data by batch
    """
    while True:
        num_batch, instance, pre_fix = parse_queue.get()
        if num_batch == -1:  ### parse_queue finished
            tokenize_queue.put((-1, None, None))
            sys.stderr.write("Thread %s finish\n" % (id))
            break
        tokenize_instance = tokenize(instance)
        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
        sys.stderr.write('.')


def save_batch(data_dir, num_tokenize, data_dir_dict):
    """
    save data by batch
    build dict.txt
    """
    token_count = 0
    while True:
        num_batch, instance, pre_fix = tokenize_queue.get()
        if num_batch == -1:
            token_count += 1
            if token_count == num_tokenize:  #### tokenize finished.
                break
            else:
                continue
        save_data(instance, data_dir, pre_fix, num_batch)
        create_dict(instance)  ## update dict

    sys.stderr.write("save file finish\n")
    f = open(data_dir_dict, 'w')
    f.write('%s\t%s\n' % ('unk', '-1'))
    for k, v in sorted(
            word_count.items(), key=operator.itemgetter(1), reverse=True):
        f.write('%s\t%s\n' % (k, v))
    f.close()
    sys.stderr.write("build dict finish\n")


def parse_batch(data, num_tokenize):
    """
    parse data by batch
    parse -> tokenize -> save
    """
    raw_txt = parse(data)
    neg, pos = [], []
    count = 0
    sys.stderr.write("extract raw data\n")
    for l in raw_txt:
        rating = l["overall"]
        text = l["reviewText"].lower()  # convert words to lower case
        if rating == 5.0 and text:
            pos.append(text)
        if rating < 3.0 and text:
            neg.append(text)
        if len(pos) == batch_size or len(neg) == batch_size:
            if len(pos) == batch_size:
                batch = pos
                pre_fix = 'pos'
            else:
                batch = neg
                pre_fix = 'neg'

            parse_queue.put((count, batch, pre_fix))
            count += 1
            if pre_fix == 'pos':
                pos = []
            else:
                neg = []

    if len(pos) > 0:
        parse_queue.put((count, pos, 'pos'))
        count += 1
    if len(neg) > 0:
        parse_queue.put((count, neg, 'neg'))
        count += 1
    for i in range(num_tokenize):
        parse_queue.put((-1, None, None))  #### for tokenize's input finished
    sys.stderr.write("parsing finish\n")


def option_parser():
    parser = OptionParser(usage="usage: python preprcoess.py "
                          "-i data_path [options]")
    parser.add_option(
        "-i", "--data", action="store", dest="input", help="Input data path.")
    parser.add_option(
        "-s",
        "--seed",
        action="store",
        dest="seed",
        default=1024,
        help="Set random seed.")
    return parser.parse_args()


def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    options, args = option_parser()
    data = options.input
    seed = options.seed
    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
    data_dir = os.path.join(os.path.dirname(data), 'tmp')
    pool = Pool(processes=num_tokenize + 2)
    pool.apply_async(parse_batch, args=(data, num_tokenize))
    for i in range(num_tokenize):
        pool.apply_async(tokenize_batch, args=(str(i), ))
    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
    pool.close()
    pool.join()

    file(os.path.join(os.path.dirname(data), 'labels.list'),
         'w').write('neg\t0\npos\t1\n')


if __name__ == '__main__':
    main()
```
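preprocess.py is a small producer/consumer pipeline built on multiprocessing: parse_batch streams the gzipped JSON and collects reviews with rating 5.0 as positive and rating below 3.0 as negative in batches of 5000; several tokenize_batch workers pipe each batch through the Moses tokenizer.perl; save_batch writes the pos_*/neg_* batch files into tmp/ and accumulates word counts for dict.txt. Note that this is Python 2 code (it relies on file(), reload(sys) and sys.setdefaultencoding). A typical next step is mapping tokens to word ids using dict.txt; the sketch below assumes the id is simply the line index in dict.txt (line 0 is unk), which is a common convention but not something preprocess.py itself defines:

```python
# A minimal sketch of turning tokenized text into word ids with dict.txt.
# ASSUMPTION: the word id is the line index in dict.txt (line 0 is 'unk');
# this convention is illustrative and not defined by preprocess.py itself.


def load_word_ids(dict_path='dict.txt'):
    """Map each word in dict.txt to its line index (line 0 is 'unk')."""
    word_to_id = {}
    with open(dict_path) as f:
        for idx, line in enumerate(f):
            word = line.split('\t')[0]
            word_to_id[word] = idx
    return word_to_id


def to_ids(tokenized_text, word_to_id, unk_id=0):
    """Replace each token with its id, falling back to unk for OOV tokens."""
    return [word_to_id.get(w, unk_id) for w in tokenized_text.split()]


if __name__ == '__main__':
    word_to_id = load_word_ids()
    with open('train.txt') as f:
        first = next(f)
    label, _, text = first.rstrip('\n').partition('\t')
    print(label, to_ids(text, word_to_id)[:10])
```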