特定Transformer架构的模型的压缩量化处理 - IT评测·应用市场-qidao123.com

import torch
import torch.nn as nn
import os
import time
from modelscope import AutoTokenizer
from transformers import LlamaForCausalLM
import bitsandbytes as bnb
import random
import numpy as np
# Fix the seed of every random number generator the script relies on
# (Python's `random`, NumPy and PyTorch) so that runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
def redirect_output_to_file(file_path):
    """Redirect ``sys.stdout`` to a text file.

    The file is opened in ``'w'`` mode, so any existing content is discarded.
    After this call everything written to ``sys.stdout`` (for example by
    ``print``) goes to that file until the caller restores ``sys.stdout``.

    Args:
        file_path: Path of the file that receives the redirected output.

    Returns:
        The opened file object that is now installed as ``sys.stdout``.
        The caller is responsible for closing it and restoring
        ``sys.stdout`` (e.g. to ``sys.__stdout__``).
    """
    import sys

    # Open explicitly as UTF-8: the redirected output may contain non-ASCII
    # text, which the locale's default encoding is not guaranteed to handle.
    log_file = open(file_path, 'w', encoding='utf-8')
    sys.stdout = log_file
    return log_file
# From here on, everything printed goes to a log file in the working directory.
output_file = os.path.join(os.getcwd(), 'LLMint8_try_output.txt')
redirect_output_to_file(output_file)

model_dir = '/home/mshw/model/Llama2-Chinese-7b-Chat-ms'
print(model_dir)

# The script needs at least one CUDA device; fail early otherwise.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA 不可用，请检查 GPU 配置")
n_gpus = torch.cuda.device_count()
print(f"Number of GPUs detected: {n_gpus}")
if n_gpus == 0:
    raise RuntimeError("没有检测到可用的 GPU")

# Per-GPU memory budget: the currently free memory, rounded down to whole GiB
# minus 1 as headroom, clamped at 0.
max_memory = {}
for i in range(n_gpus):
    free_mem, total_mem = torch.cuda.mem_get_info(i)
    max_mem_gb = max(0, int(free_mem / 1024**3) - 1)
    max_memory[i] = f"{max_mem_gb}GB"
    print(f"GPU {i}: {max_mem_gb}GB free memory allocated for usage.")

cache_dir = os.getcwd()
offload_folder = os.path.join(cache_dir, 'offload')
os.makedirs(offload_folder, exist_ok=True)

# Load the model in fp16. The memory budget and offload folder prepared above
# are passed through so that they actually constrain the automatic device
# placement (previously they were computed but never used).
model = LlamaForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
    device_map='auto',
    max_memory=max_memory,
    offload_folder=offload_folder,
)
# Make sure the model's tied weights are bound.
model.tie_weights()
tokenizer = AutoTokenizer.from_pretrained(model_dir, cache_dir=cache_dir)
def get_sparsity(tensor: torch.Tensor) -> float:
    """Return the fraction of elements in ``tensor`` that are exactly zero.

    Args:
        tensor: Tensor to inspect.

    Returns:
        ``1 - nonzero / total`` as a Python float. An empty tensor has no
        elements to be zero, so it is reported as ``0.0`` rather than
        raising ``ZeroDivisionError``.
    """
    total = tensor.numel()
    if total == 0:
        return 0.0
    # Count on the tensor's own device; only the scalar result is brought
    # back to the host instead of copying the whole tensor to the CPU.
    nonzero = tensor.count_nonzero().item()
    return 1 - nonzero / total
def get_num_parameters(model: nn.Module, count_nonzero_only=False) -> int:
    """Count the elements of all parameters of ``model``.

    Args:
        model: Module whose ``parameters()`` are counted.
        count_nonzero_only: When true, count only elements that are not
            zero; otherwise count every element.

    Returns:
        The count as a plain Python ``int`` in both modes.
    """
    if count_nonzero_only:
        # ``count_nonzero`` yields a 0-dim tensor; convert each one to int so
        # the running total (and the return value) is an int, not a tensor.
        # The count runs on the parameter's own device, avoiding a full copy
        # of every parameter to the CPU.
        return sum(
            int(param.detach().count_nonzero()) for param in model.parameters()
        )
    return sum(param.numel() for param in model.parameters())
def get_model_size_in_gb(model: nn.Module, data_width=8, count_nonzero_only=False) -> float:
    """Estimate the storage size of ``model``'s parameters in GiB.

    Args:
        model: Module whose parameters are counted via ``get_num_parameters``.
        data_width: Number of bits used to store one element.
        count_nonzero_only: Forwarded to ``get_num_parameters``; when true
            only non-zero elements contribute to the size.

    Returns:
        ``elements * data_width / 8`` bytes, expressed in units of
        ``1024**3`` bytes.
    """
    # int() keeps the result a plain float even if the counter hands back a
    # 0-dim tensor.
    num_elements = int(get_num_parameters(model, count_nonzero_only))
    # Work in bits and divide once: flooring ``data_width // 8`` first would
    # turn sub-byte widths (e.g. 4-bit) into 0 bytes per element.
    model_size_bytes = num_elements * data_width / 8
    return model_size_bytes / (1024**3)
def calculate_memory_access(tensor: torch.Tensor) -> int:
    """Return the number of bytes taken up by the elements of ``tensor``.

    This is the element count multiplied by the size in bytes of a single
    element.
    """
    element_count = tensor.numel()
    bytes_per_element = tensor.element_size()
    return element_count * bytes_per_element
def get_model_memory_access(model: nn.Module) -> int:
    """Return the summed ``calculate_memory_access`` value of every parameter.

    Args:
        model: Module whose ``parameters()`` are visited.

    Returns:
        The total over all parameters; ``0`` when the module has none.
    """
    return sum(calculate_memory_access(weight) for weight in model.parameters())
import sys

# Model size computed with an 8-bit element width.
# NOTE(review): this value is computed but never printed or used below.
model_size_gb = get_model_size_in_gb(model, data_width=8)
inputs = "咖啡的作用是什么？"
MiB = 1024 * 1024
try:
    # Reset the peak counters on every GPU that is reported below; calling
    # reset_peak_memory_stats() without a device only resets the current one.
    for i in range(n_gpus):
        torch.cuda.reset_peak_memory_stats(i)
    start_time = time.time()
    # Tokenize the prompt, move it to the GPU and generate a completion.
    inputs_encoded = tokenizer(inputs, return_tensors='pt').to('cuda')
    inputs_encoded.pop("token_type_ids", None)  # drop the key that is not needed
    outputs = model.generate(**inputs_encoded)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    end_time = time.time()
    for i in range(n_gpus):
        peak_mem = torch.cuda.max_memory_reserved(device=i) / MiB
        print(f"GPU {i} 峰值内存: {peak_mem:.2f} MiB")
    print(result)
    print(f"推理时间: {end_time - start_time:.2f} 秒")
finally:
    # Restore standard output even if inference failed, so the log file is
    # flushed and closed and later output reaches the real stdout again.
    sys.stdout.close()
    sys.stdout = sys.__stdout__

复制代码