word2sequence 把字符串转换为数字编码
1.准备数据
dataset.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | ''' 准备数据 ''' from torch.utils.data import DataLoader,Dataset import torch import utils import os import config class ImdbDataset(Dataset): def __init__( self ,train = True ): data_path = r "H:\073-nlp自然语言处理-v5.bt38[周大伟]\073-nlp自然语言处理-v5.bt38[周大伟]\第四天\代码\data\aclImdb_v1\aclImdb" super (ImdbDataset, self ).__init__() data_path + = r "\train" if train else r "\test" self .total_path = [] for temp_path in [r "\pos" ,r "\neg" ]: cur_path = data_path + temp_path self .total_path + = [os.path.join(cur_path,i) for i in os.listdir(cur_path) if i.endswith( ".txt" )] def __getitem__( self , idx): file = self .total_path[idx] review = utils.tokenlize( open ( file ,encoding = 'utf-8' ).read()) label = int ( file .split( "_" )[ - 1 ].split( "." )[ 0 ]) # label = 0 if label <5 else 1 return review,label def __len__( self ): return len ( self .total_path) # def collate_fn(batch): # #batch是list,其中是一个一个元组,每个元组是dataset中__getitem__的结果 # batch = list(zip(*batch)) # labes = torch.tensor(batch[1],dtype=torch.int32) # texts = batch[0] # del batch # return labes,texts def collate_fn(batch): """ 对batch数据进行处理 :param batch: [一个getitem的结果,getitem的结果,getitem的结果] :return: 元组 """ reviews,labels = zip ( * batch) reviews = torch.LongTensor([config.ws.transform(i,max_len = config.max_len) for i in reviews]) labels = torch.LongTensor(labels) return reviews,labels def get_dataloader(train = True ): dataset = ImdbDataset(train) batch_size = config.train_batch_size if train else config.test_batch_size return DataLoader(dataset,batch_size = batch_size,shuffle = True ,collate_fn = collate_fn) if __name__ = = '__main__' : dataset = ImdbDataset() dataloader = DataLoader(dataset = dataset, batch_size = 2 , shuffle = True ,collate_fn = collate_fn) # 3. 
观察数据输出结果 for idx, (label, text) in enumerate (dataloader): print ( "idx:" , idx) print ( "table:" , label) print ( "text:" , text) break |
2.conf.py 文件
"""
配置文件 — batch sizes, max sequence length, and the fitted vocabulary.
"""
import pickle

train_batch_size = 512
test_batch_size = 500

# Load the Word2Sequence vocabulary produced by the main build script.
# BUG FIX: the original `pickle.load(open(...))` never closed the file handle.
# SECURITY NOTE: pickle.load executes arbitrary code — only load trusted files.
with open("./model/ws.pkl", "rb") as _f:
    ws = pickle.load(_f)

max_len = 80
3.utils.py分词文件
import re

# Characters to strip before splitting, expressed as one character class.
# BUG FIX: the original built an alternation with "|".join(fileters); inside an
# alternation the unescaped "^" and "$" act as anchors matching empty strings,
# so literal "^"/"$" characters were never removed from the text.  A character
# class treats every member literally and is compiled once (single C pass).
_FILTER_RE = re.compile(r'[!"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n\x97\x96”“]')


def tokenlize(sentence):
    '''
    进行文本分词 — lowercase, strip <br /> tags and punctuation, split on spaces.

    :param sentence: raw review text (str)
    :return: list of non-empty lowercase tokens
    '''
    sentence = sentence.lower()
    sentence = re.sub("<br />", " ", sentence)
    sentence = _FILTER_RE.sub(" ", sentence)
    # 去除空字符串 — consecutive separators produce empty fragments.
    result = [i for i in sentence.split(" ") if len(i) > 0]
    return result
4.word2sequence.py 句子中的词转换成数字编码
'''
文本序列化 — map tokens to integer ids and back.
'''


class Word2Sequence:
    UNK_TAG = "<UNK>"  # token for out-of-vocabulary words
    PAD_TAG = "<PAD>"  # token used to pad short sentences
    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {  # 保存词语和对应的数字
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}  # 统计词频的 (word -> frequency)

    def fit(self, sentence):
        '''
        接受句子,统计词频

        :param sentence: list of str tokens
        :return: None
        '''
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min_count=1, max_count=None, max_feature=None):
        '''
        根据条件构造 词典

        :param min_count: 最小词频 (keep words with frequency >= min_count)
        :param max_count: 最大词频 (keep words with frequency <= max_count)
        :param max_feature: 最大词语数,这个参数会排序 (keep most frequent first)
        :return: None
        '''
        if min_count is not None:
            self.count = {word: count for word, count in self.count.items() if count >= min_count}
        if max_count is not None:
            self.count = {word: count for word, count in self.count.items() if count <= max_count}
        if max_feature is not None:
            # BUG FIX: sorted() takes `key` as a keyword-only argument in
            # Python 3; the original passed the lambda positionally -> TypeError.
            self.count = dict(sorted(self.count.items(), key=lambda x: x[-1],
                                     reverse=True)[:max_feature])
        for word in self.count.keys():
            self.dict[word] = len(self.dict)  # 获取每个词及生成每个词对应的编号
        # 字典翻转 — id -> word reverse mapping.
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        '''
        把句子转化为数字序列

        :param sentence: [str,str,,,,,,,,,,]
        :param max_len: target length (truncate/pad); None keeps the original
                        length — BUG FIX: the original compared len(sentence)
                        against None and crashed with TypeError.
        :return: [num,num,num,,,,,,,]
        '''
        if max_len is not None:
            if len(sentence) > max_len:
                sentence = sentence[:max_len]
            else:
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
        # Unknown words map to UNK (the original hard-coded the magic 0).
        return [self.dict.get(i, self.UNK) for i in sentence]

    def inverse_transform(self, incides):
        '''
        把数字序列转化为字符

        :param incides: [num,num,num,,,,,,,,]
        :return: [str,str,str,,,,,,,]
        '''
        return [self.inverse_dict.get(i, self.UNK_TAG) for i in incides]


if __name__ == '__main__':
    sentences = [['今天', '天气', '很', '好'],
                 ['今天', '去', '吃', '什么']]
    ws = Word2Sequence()
    for sentence in sentences:
        ws.fit(sentence)
    ws.build_vocab()
    print(ws.dict)
    ret = ws.transform(["好", "好", "好", "好", "好", "好", "好", "热", "呀"], max_len=20)
    print(ret)
    ret = ws.inverse_transform(ret)
    print(ret)
5. main主文件,把文件中的词转换成数字编码并保存
'''
文本序列化及保存模型 — fit the vocabulary over the whole corpus and pickle it.

NOTE(review): dataset.py's collate_fn encodes batches via config.ws, which is
the very object this script produces — on a first run ws.pkl does not exist
yet, so config fails at import.  Presumably the first pass is meant to use a
collate_fn that returns raw token lists (the commented-out one in dataset.py);
confirm the intended bootstrap order.
'''
from word_sequence import Word2Sequence
from dataset import get_dataloader
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    ws = Word2Sequence()
    dl_train = get_dataloader(True)
    dl_test = get_dataloader(False)
    # BUG FIX: collate_fn yields (reviews, labels); the original unpacked the
    # pair swapped and therefore fitted the vocab on the label tensor.
    for reviews, label in tqdm(dl_train, total=len(dl_train)):
        for review in reviews:
            ws.fit(review)
    # BUG FIX: progress total was len(dl_train) while iterating the test set.
    for reviews, label in tqdm(dl_test, total=len(dl_test)):
        for review in reviews:
            ws.fit(review)
    ws.build_vocab()
    # Use a context manager so the pickle file is flushed and closed.
    with open("./model/ws.pkl", "wb") as f:
        pickle.dump(ws, f)
多思考也是一种努力,做出正确的分析和选择,因为我们的时间和精力都有限,所以把时间花在更有价值的地方。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统
· 【译】Visual Studio 中新的强大生产力特性
· 2025年我用 Compose 写了一个 Todo App