Running an 80 GB LLM on a 16 GB GPU
I recently came across a post about running inference of a 70B model on a single 16 GB GPU without quantization and without any loss of precision. The approach comes from a Kaggle notebook, and the overall workflow is as follows (a minimal sketch of the loop appears right after the list):
1. Create an empty (i.e., weightless) model
2. Decide where each layer will go (when multiple devices are available)
3. Load part of the weights into memory
4. Load those weights into the empty model
5. Move the weights onto the device for inference
6. Repeat from step 3 until all the weights have been loaded
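To make steps 3-6 concrete, here is a minimal sketch of the streaming loop. It is not the notebook's code: layers, layer_names and load_layer_weights are hypothetical placeholders for the list of empty modules, their names, and whatever reads one layer's state_dict from disk, and load_state_dict(assign=True) needs a recent PyTorch (the actual notebook uses accelerate's set_module_tensor_to_device instead, as shown later).

import torch

def stream_layers(layers, layer_names, load_layer_weights, hidden_states, device="cuda:0"):
    # Hypothetical sketch: push the activations through the model one layer at a
    # time, keeping only the current layer's weights on the GPU.
    for name, layer in zip(layer_names, layers):
        state_dict = load_layer_weights(name)            # step 3: disk -> CPU RAM
        layer.load_state_dict(state_dict, assign=True)   # step 4: fill the empty (meta) layer
        layer.to(device)                                 # step 5: CPU -> GPU
        with torch.inference_mode():
            hidden_states = layer(hidden_states)         # run only this layer
        layer.to("meta")                                 # drop the weights before the next layer
    return hidden_states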
PyTorch 1.9 introduced a new device called the meta device. It lets us create tensors with no data attached: a tensor on the meta device only needs a shape. As long as you stay on the meta device, you can create arbitrarily large tensors without worrying about whether there is enough CPU (or GPU) RAM.
For example, the following code will crash if there is not enough RAM:
import torch
large_tensor = torch.randn(100000, 100000)
This tensor requires 4 * 10**10 bytes (the default precision is FP32, so each element takes 4 bytes), i.e. 40 GB of RAM. The same operation on the meta device, however, runs just fine:
import torch
large_tensor = torch.randn(100000, 100000, device="meta")
This tensor has no data attached to it, only a shape. You can also instantiate a model directly on the meta device:
large_model = torch.nn.Linear(100000, 100000, device="meta")
For off-the-shelf models, however, this syntax would require rewriting all of the modeling code so that every submodule accepts and passes along a device keyword argument. Since that is impractical for the pretrained models in the Transformers library, the accelerate library provides a context manager that wraps the meta device and lets you instantiate an empty model:
# Load meta model (no memory used)
with init_empty_weights():
    self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=True)
    self.model.tie_weights()
This step is crucial: since we know the shape of every weight, we know exactly how much memory each one will consume once the pretrained tensors are fully loaded, and we can therefore decide how to split the model between CPU and GPU (a small sketch of this bookkeeping follows).
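As a minimal illustration (not part of the original notebook), the per-block memory footprint can be read straight off the meta model's parameter shapes. The checkpoint name is only a placeholder and may require access; any causal-LM config works, since no weights are downloaded:

from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder repo id: only the config is fetched, the model lives on the meta device.
config = AutoConfig.from_pretrained("meta-llama/Llama-2-70b-hf")

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Every parameter has a shape but no data, so we can estimate how much memory
# each block will need once its weights are actually loaded in fp16.
bytes_per_block = {}
for name, param in model.named_parameters():
    block = ".".join(name.split(".")[:3]) if name.startswith("model.layers.") else name
    bytes_per_block[block] = bytes_per_block.get(block, 0) + param.numel() * 2  # 2 bytes per fp16 value

print(f"total: {sum(bytes_per_block.values()) / 1e9:.1f} GB in fp16")
print(f"largest single block: {max(bytes_per_block.values()) / 1e9:.2f} GB")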
On top of that, two key methods are defined: load_layer_to_cpu, which moves a layer's weights from disk into CPU RAM, and move_layer_to_device, which moves those weights from the CPU onto the GPU. A third helper, clean_memory, frees memory in between.
def load_layer_to_cpu(self, layer_name):
    # Read one layer's weights from disk into CPU RAM
    self.weights_loader.set_state_dict(layer_name, self.device)
    state_dict = self.weights_loader.get_state_dict(self.device)
    if "value_head.weight" in state_dict:
        state_dict = {"lm_head.weight": state_dict["value_head.weight"]}
    return state_dict

def move_layer_to_device(self, state_dict):
    # Copy the layer's weights from the CPU onto the GPU
    for param_name, param in state_dict.items():
        assert param.dtype != torch.int8, "int8 not supported (need to add fp16_statistics)"
        set_module_tensor_to_device(self.model, param_name, self.device, value=param, dtype=self.dtype)

def clean_memory():
    # Release CPU and GPU memory between layers
    gc.collect()
    ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()
The complete code is shown below:
import ctypes
import gc
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file
from optimum.bettertransformer import BetterTransformer

N_BATCHES = 3
MAX_LENGTH = 4096

def clean_memory():
    gc.collect()
    ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()


# Class for sharded llama
class ShardedLlama:
    def __init__(self, checkpoint_path, weights_loader, device="cuda:0", dtype=torch.float16):

        # Save parameters
        self.checkpoint_path = Path(checkpoint_path)
        self.weights_loader = weights_loader
        self.device = device
        self.dtype = dtype

        # Create model
        self.config = AutoConfig.from_pretrained(self.checkpoint_path)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        self.init_model()
        self.layer_names = ["model.embed_tokens"] + [f"model.layers.{i}" for i in range(len(self.model.model.layers))] + ["model.norm", "value_head"]

    def init_model(self):

        # Load meta model (no memory used)
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(self.config)
            self.model.lm_head = torch.nn.Linear(8192, 8, bias=False)  # originally 32k
            self.model.eval()
            self.model = BetterTransformer.transform(self.model)  # enable flash attention
            self.model.tie_weights()

        self.layers = [self.model.model.embed_tokens] + list(self.model.model.layers) + [self.model.model.norm, self.model.lm_head]

        # Move buffers to device (note that much GPU memory used)
        for buffer_name, buffer in self.model.named_buffers():
            set_module_tensor_to_device(self.model, buffer_name, self.device, value=buffer, dtype=self.dtype)

    def load_layer_to_cpu(self, layer_name):
        self.weights_loader.set_state_dict(layer_name, self.device)
        state_dict = self.weights_loader.get_state_dict(self.device)
        if "value_head.weight" in state_dict:
            state_dict = {"lm_head.weight": state_dict["value_head.weight"]}
        return state_dict

    def move_layer_to_device(self, state_dict):
        for param_name, param in state_dict.items():
            assert param.dtype != torch.int8, "int8 not supported (need to add fp16_statistics)"
            set_module_tensor_to_device(self.model, param_name, self.device, value=param, dtype=self.dtype)

    def __call__(self, inputs):
        # inputs = [(prefix, suffix), ...] with prefix.shape[0] = 1 and suffix.shape[0] = 5

        # Reboot the model to make sure buffers are loaded and memory is clean
        del self.model
        clean_memory()
        self.init_model()

        # Send batch to device
        batch = [(prefix.to(self.device), suffix.to(self.device)) for prefix, suffix in inputs]
        n_suffixes = len(batch[0][1])
        suffix_eos = [(suffix != self.tokenizer.pad_token_id).sum(1) - 1 for _, suffix in inputs]

        # Create attention mask for the largest input, and position ids to use KV cache
        attention_mask = torch.ones(MAX_LENGTH, MAX_LENGTH)
        attention_mask = attention_mask.triu(diagonal=1)[None, None, ...] == 0
        attention_mask = attention_mask.to(self.device)
        position_ids = torch.arange(MAX_LENGTH, dtype=torch.long, device=self.device)[None, :]

        with ThreadPoolExecutor() as executor, torch.inference_mode():

            # Load first layer
            future = executor.submit(self.load_layer_to_cpu, "model.embed_tokens")

            for i, (layer_name, layer) in tqdm(enumerate(zip(self.layer_names, self.layers)), desc=self.device, total=len(self.layers)):

                # Load current layer and prepare next layer
                state_dict = future.result()
                if (i + 1) < len(self.layer_names):
                    future = executor.submit(self.load_layer_to_cpu, self.layer_names[i + 1])
                self.move_layer_to_device(state_dict)

                # Run layer
                for j, (prefix, suffix) in enumerate(batch):
                    if layer_name == "model.embed_tokens":
                        batch[j] = (layer(prefix), layer(suffix))
                    elif layer_name == "model.norm":
                        # Only keep the last token at this point
                        batch[j] = (None, layer(suffix[torch.arange(n_suffixes), suffix_eos[j]][:, None]))
                    elif layer_name == "value_head":
                        batch[j] = layer(suffix)[:, 0].mean(1).detach().cpu().numpy()
                    else:
                        # Run prefix
                        len_p, len_s = prefix.shape[1], suffix.shape[1]
                        new_prefix, (k_cache, v_cache) = layer(prefix, use_cache=True, attention_mask=attention_mask[:, :, -len_p:, -len_p:])

                        # Run suffix
                        pos = position_ids[:, len_p:len_p + len_s].expand(n_suffixes, -1)
                        attn = attention_mask[:, :, -len_s:, -len_p - len_s:].expand(n_suffixes, -1, -1, -1)
                        kv_cache = (k_cache.expand(n_suffixes, -1, -1, -1), v_cache.expand(n_suffixes, -1, -1, -1))
                        new_suffix = layer(suffix, past_key_value=kv_cache, position_ids=pos, attention_mask=attn)[0]
                        batch[j] = (new_prefix, new_suffix)

                # Remove previous layer from memory (including buffers)
                layer.to("meta")
                clean_memory()  # proposed by CPMP

        # Get scores
        return batch


def run_model(device, df, weights_loader):
    model = ShardedLlama(checkpoint_path, weights_loader, device=device)
    f = partial(get_tokens, tokenizer=model.tokenizer)
    inputs = df.apply(f, axis=1).values
    batches = np.array_split(inputs, N_BATCHES)
    outputs = []
    for i, batch in enumerate(batches):
        outputs += model(batch)
    return outputs
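Two details make this loop practical. First, the ThreadPoolExecutor submits the disk-to-CPU load of layer i+1 before layer i runs on the GPU, so disk I/O overlaps with compute instead of stalling it. Second, after each layer has run, layer.to("meta") plus clean_memory() drops its weights, so GPU memory stays bounded by a single layer plus the activations rather than the full model. A hypothetical call site might look like the line below; checkpoint_path, weights_loader, get_tokens and df come from the rest of the notebook and are not defined here.

# Hypothetical usage, assuming the notebook's checkpoint_path, weights_loader,
# get_tokens and df are already set up.
outputs = run_model("cuda:0", df, weights_loader)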
Full notebook: https://www.kaggle.com/code/simjeg/platypus2-70b-without-wikipedia-rag