用 NumPy 手撕 GPT-2（前向推理 + 贪心解码）
# Minimal GPT-2 forward pass and greedy decoding written with plain NumPy.
#
# Parameter containers are nested dicts whose keys match the keyword names of
# the functions below (e.g. a linear layer is {"w": ..., "b": ...}), so they
# can be splatted with ** straight into the calls.
import numpy as np


def gelu(x):
    """Return the tanh approximation of the GELU activation, element-wise."""
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))


def softmax(x):
    """Return the softmax of ``x`` along its last axis.

    The row maximum is subtracted before exponentiating so that large inputs
    do not overflow; the result is mathematically unchanged.
    """
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


def layer_norm(x, g, b, eps: float = 1e-5):
    """Normalize ``x`` over its last axis, then scale by ``g`` and shift by ``b``.

    ``eps`` is added to the variance to avoid division by zero.
    """
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    return g * (x - mean) / np.sqrt(variance + eps) + b


def linear(x, w, b):
    """Return the affine projection ``x @ w + b``."""
    return x @ w + b


def ffn(x, c_fc, c_proj):
    """Apply the position-wise feed-forward network.

    ``c_fc`` and ``c_proj`` are dicts of ``linear`` keyword arguments
    (``w`` and ``b``): project with ``c_fc``, apply GELU, project with
    ``c_proj``.
    """
    return linear(gelu(linear(x, **c_fc)), **c_proj)


def attention(q, k, v, mask):
    """Return scaled dot-product attention for 2-D ``q``, ``k``, ``v``.

    ``mask`` is added to the scaled scores before the softmax, so large
    negative entries suppress the corresponding positions.
    """
    return softmax(q @ k.T / np.sqrt(q.shape[-1]) + mask) @ v


def mha(x, c_attn, c_proj, n_head):
    """Apply causal multi-head self-attention to ``x``.

    ``x`` is treated as a 2-D array whose first axis is the sequence position
    (the mask is sized from ``x.shape[0]``). ``c_attn`` projects ``x`` to the
    concatenated queries, keys and values; ``c_proj`` projects the merged
    heads back. ``np.split`` requires the projected width to divide evenly
    into 3 parts and each part into ``n_head`` heads.
    """
    x = linear(x, **c_attn)
    # Split into [q, k, v], then split each of them into n_head heads.
    qkv_heads = [np.split(part, n_head, axis=-1) for part in np.split(x, 3, axis=-1)]
    # Zero on and below the diagonal, -1e10 above it: a position cannot
    # attend to positions that come after it.
    causal_mask = (1 - np.tri(x.shape[0])) * -1e10
    out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)]
    return linear(np.hstack(out_heads), **c_proj)


def transformer_block(x, mlp, attn, ln_1, ln_2, n_head):
    """Apply one pre-norm transformer block: attention, then feed-forward.

    Each sub-layer is applied to a layer-normalized input and added back to
    ``x`` as a residual.
    """
    x = x + mha(layer_norm(x, **ln_1), **attn, n_head=n_head)
    x = x + ffn(layer_norm(x, **ln_2), **mlp)
    return x


def gpt2(inputs, wte, wpe, blocks, ln_f, n_head):
    """Run the model over the token ids ``inputs`` and return the logits.

    ``wte`` is indexed by token id and ``wpe`` by position; the output is
    projected back through ``wte.T``, so there is one row of logits per input
    position.
    """
    x = wte[inputs] + wpe[range(len(inputs))]
    for block in blocks:
        x = transformer_block(x, **block, n_head=n_head)
    return layer_norm(x, **ln_f) @ wte.T


def generate(inputs, params, n_head, n_tokens_to_generate):
    """Greedily decode ``n_tokens_to_generate`` tokens after ``inputs``.

    Returns only the newly generated token ids, as a list of Python ints.
    The caller's ``inputs`` sequence is not modified.
    """
    # The progress bar is cosmetic; fall back to a plain range without tqdm.
    try:
        from tqdm import tqdm
    except ImportError:
        steps = range(n_tokens_to_generate)
    else:
        steps = tqdm(range(n_tokens_to_generate), "generating")

    # Work on a private list: appending is O(1) and keeps the ids plain ints.
    tokens = list(inputs)
    for _ in steps:
        logits = gpt2(tokens, **params, n_head=n_head)
        # Greedy sampling: take the most likely token at the last position.
        tokens.append(int(np.argmax(logits[-1])))
    return tokens[len(tokens) - n_tokens_to_generate:]


def main(prompt: str, n_tokens_to_generate: int = 40, model_size: str = "124M", models_dir: str = "models"):
    """Encode ``prompt``, generate a continuation and return it as text.

    Raises:
        ValueError: if the prompt plus the requested tokens does not fit
            within the model's ``n_ctx`` limit.
    """
    from utils import load_encoder_hparams_and_params

    encoder, hparams, params = load_encoder_hparams_and_params(model_size, models_dir)
    input_ids = encoder.encode(prompt)

    # Explicit check rather than `assert`, which is removed under `python -O`.
    if len(input_ids) + n_tokens_to_generate >= hparams["n_ctx"]:
        raise ValueError(
            f"prompt length ({len(input_ids)}) + n_tokens_to_generate "
            f"({n_tokens_to_generate}) must be less than n_ctx ({hparams['n_ctx']})"
        )

    output_ids = generate(input_ids, params, hparams["n_head"], n_tokens_to_generate)
    output_text = encoder.decode(output_ids)
    return output_text


if __name__ == "__main__":
    import fire

    fire.Fire(main)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· Vue3状态管理终极指南:Pinia保姆级教程