MiniCPM3-4B is a high-performance edge-side AI model jointly developed by ModelBest (面壁智能) and the Natural Language Processing Lab of Tsinghua University. It is the third generation of the MiniCPM series and has 4 billion parameters.
MiniCPM3-4B outperforms Phi-3.5-mini-Instruct and GPT-3.5-Turbo-0125, and is competitive with a number of models in the 7B-9B parameter range.
MiniCPM3-4B improves markedly on several fronts, including a larger vocabulary, more model layers, and a wider hidden dimension, which strengthens its processing capability.
MiniCPM3-4B is designed with a 32k context window and can, in theory, handle unlimited context, a major advantage for users who need to process large amounts of data and complex queries.
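These architecture parameters can be read straight from the model configuration. A minimal sketch, assuming the transformers library is installed and the weights are reachable; the attribute names follow the standard transformers config conventions:

from transformers import AutoConfig

# MiniCPM3 ships custom modeling code, hence trust_remote_code=True
config = AutoConfig.from_pretrained("OpenBMB/MiniCPM3-4B", trust_remote_code=True)
print("vocab size:", config.vocab_size)
print("layers:", config.num_hidden_layers)
print("hidden size:", config.hidden_size)
print("max positions:", config.max_position_embeddings)  # context window (32k)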
MiniCPM3-4B also supports more efficient code execution and function calling, letting developers implement complex tasks more quickly.
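A hypothetical sketch of a function call against MiniCPM3-4B served behind an OpenAI-compatible endpoint (for example, one started with vLLM); the base_url, served model name, and the get_weather tool schema are illustrative assumptions, not part of the original post:

from openai import OpenAI

# Hypothetical local endpoint; adjust base_url/model to however the model is served
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Illustrative tool schema -- get_weather is a made-up example function
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="MiniCPM3-4B",
    messages=[{"role": "user", "content": "What is the weather in Beijing?"}],
    tools=tools,
)
print(resp.choices[0].message)  # may contain a tool_calls entry for get_weather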
In addition, ModelBest has released MiniCPM3-RAG-LoRA, a fine-tuned variant for RAG scenarios, together with the RAG suite's MiniCPM-Embedding and MiniCPM-Reranker models.
GitHub repository: https://github.com/OpenBMB/MiniCPM.
I. Environment setup
1. Python environment
Python 3.10 or later is recommended.
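A one-line check, in case several interpreters are installed:

import sys
assert sys.version_info >= (3, 10), "Python 3.10+ is recommended for MiniCPM3"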
2. Install the pip dependencies (requirements.txt here refers to the one shipped in the MiniCPM GitHub repository linked above):
pip install torch==2.3.0+cu118 torchvision==0.18.0+cu118 torchaudio==2.3.0 --extra-index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install datamodel_code_generator -i https://pypi.tuna.tsinghua.edu.cn/simple
3. Download the MiniCPM3-4B model:
git lfs install
git clone https://modelscope.cn/models/OpenBMB/MiniCPM3-4B
4. Download the MiniCPM3-RAG-LoRA model:
git lfs install
git clone https://modelscope.cn/models/OpenBMB/MiniCPM3-RAG-LoRA
5. Download the MiniCPM-Reranker model:
git lfs install
git clone https://modelscope.cn/models/OpenBMB/MiniCPM-Reranker
6. Download the MiniCPM-Embedding model:
git lfs install
git clone https://modelscope.cn/models/OpenBMB/MiniCPM-Embedding
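After cloning, a quick sanity check confirms that all four repositories are present (the relative paths below are an assumption based on running the clone commands in the current directory):

from pathlib import Path

# Each downloaded model repo should contain a config.json at its root
for repo in ["MiniCPM3-4B", "MiniCPM3-RAG-LoRA", "MiniCPM-Reranker", "MiniCPM-Embedding"]:
    print(repo, "OK" if (Path(repo) / "config.json").exists() else "missing")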
II. Function testing
1. Running the tests:
(1) Invoking the models from Python
import torch
from modelscope import AutoModelForCausalLM, AutoModel, AutoTokenizer, snapshot_download
from transformers import AutoModelForSequenceClassification
from peft import PeftModel
import torch.nn.functional as F
import numpy as np


def MiniCPM3_4B_inference(message, model_path="OpenBMB/MiniCPM3-4B", device="cuda"):
    """Chat with the base MiniCPM3-4B model."""
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
    messages = [{"role": "user", "content": message}]
    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(device)
    model_outputs = model.generate(
        model_inputs,
        max_new_tokens=1024,
        top_p=0.7,
        temperature=0.7,
        repetition_penalty=1.02
    )
    # Strip the prompt tokens, keeping only the newly generated ones
    output_token_ids = [model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))]
    responses = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
    return responses


def MiniCPM3_RAG_LoRA_inference(instruction, passages_list, base_model_dir="OpenBMB/MiniCPM3-4B", lora_model_dir="OpenBMB/MiniCPM3-RAG-LoRA"):
    """Answer a question grounded in retrieved passages using the RAG LoRA adapter."""
    base_model_dir = snapshot_download(base_model_dir)
    lora_model_dir = snapshot_download(lora_model_dir)
    model = AutoModelForCausalLM.from_pretrained(base_model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(lora_model_dir)
    # Attach the LoRA adapter on top of the frozen base model
    model = PeftModel.from_pretrained(model, lora_model_dir)
    passages = '\n'.join(passages_list)
    input_text = 'Background:\n' + passages + '\n\n' + instruction
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input_text},
    ]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    outputs = model.chat(tokenizer, prompt, temperature=0.8, top_p=0.8)
    return outputs[0]


def MiniCPM_Embedding_inference(queries, passages, model_name="OpenBMB/MiniCPM-Embedding", device="cuda"):
    """Score query/passage similarity with the MiniCPM-Embedding model."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to(device)
    model.eval()

    def weighted_mean_pooling(hidden, attention_mask):
        # Later tokens receive larger weights via the cumulative sum of the mask
        attention_mask_ = attention_mask * attention_mask.cumsum(dim=1)
        s = torch.sum(hidden * attention_mask_.unsqueeze(-1).float(), dim=1)
        d = attention_mask_.sum(dim=1, keepdim=True).float()
        reps = s / d
        return reps

    @torch.no_grad()
    def encode(input_texts):
        batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to(device)
        outputs = model(**batch_dict)
        attention_mask = batch_dict["attention_mask"]
        hidden = outputs.last_hidden_state
        reps = weighted_mean_pooling(hidden, attention_mask)
        # L2-normalize so the dot product below equals cosine similarity
        embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
        return embeddings

    INSTRUCTION = "Query: "
    queries = [INSTRUCTION + query for query in queries]
    embeddings_query = encode(queries)
    embeddings_doc = encode(passages)
    scores = (embeddings_query @ embeddings_doc.T)
    return scores.tolist()


def MiniCPM_Reranker_rerank(queries, passages, model_name='OpenBMB/MiniCPM-Reranker', device="cuda", max_len_q=512, max_len_d=512):
    """Rerank candidate passages for each query with the MiniCPM-Reranker model."""
    model_name = snapshot_download(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.padding_side = "right"
    model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to(device)
    model.eval()

    def tokenize_our(query, doc):
        # Concatenate query and document as <bos> query <eos> doc, then pad to a fixed length
        input_id_query = tokenizer.encode(query, add_special_tokens=False, max_length=max_len_q, truncation=True)
        input_id_doc = tokenizer.encode(doc, add_special_tokens=False, max_length=max_len_d, truncation=True)
        pad_input = {"input_ids": [tokenizer.bos_token_id] + input_id_query + [tokenizer.eos_token_id] + input_id_doc}
        return tokenizer.pad(
            pad_input,
            padding="max_length",
            max_length=max_len_q + max_len_d + 2,
            return_tensors="pt",
        )

    @torch.no_grad()
    def rerank(input_query, input_docs):
        tokenized_inputs = [tokenize_our(input_query, input_doc).to(device) for input_doc in input_docs]
        input_ids = {
            "input_ids": [tokenized_input["input_ids"] for tokenized_input in tokenized_inputs],
            "attention_mask": [tokenized_input["attention_mask"] for tokenized_input in tokenized_inputs]
        }
        # Stack the per-document encodings into one batch
        for k in input_ids:
            input_ids[k] = torch.stack(input_ids[k]).to(device)
        outputs = model(**input_ids)
        score = outputs.logits
        return score.float().detach().cpu().numpy()

    INSTRUCTION = "Query: "
    queries = [INSTRUCTION + query for query in queries]
    scores = [rerank(query, docs) for query, docs in zip(queries, passages)]
    return np.array(scores)


def main():
    # Example use cases
    response_4B = MiniCPM3_4B_inference("推荐5个北京的景点。")
    print(f"MiniCPM3-4B Response: {response_4B}")

    instruction = "Q: What is the name of the lead character in the novel 'The Silent Watcher'?\nA:"
    passages_list = [
        "In the novel 'The Silent Watcher,' the lead character is named Alex Carter. Alex is a private detective who uncovers a series of mysterious events in a small town.",
        "Set in a quiet town, 'The Silent Watcher' follows Alex Carter, a former police officer turned private investigator, as he unravels the town's dark secrets.",
        "'The Silent Watcher' revolves around Alex Carter's journey as he confronts his past while solving complex cases in his hometown."
    ]
    response_RAG_LoRA = MiniCPM3_RAG_LoRA_inference(instruction, passages_list)
    print(f"MiniCPM3-RAG-LoRA Response: {response_RAG_LoRA}")

    queries = ["China capital?"]
    passages = ["beijing", "shanghai"]
    scores_embedding = MiniCPM_Embedding_inference(queries, passages)
    print(f"MiniCPM-Embedding Scores: {scores_embedding}")

    rerank_queries = ["China capital?"]
    rerank_passages = [["beijing", "shanghai"]]
    scores_reranker = MiniCPM_Reranker_rerank(rerank_queries, rerank_passages)
    print(f"MiniCPM-Reranker Scores: {scores_reranker}")


if __name__ == "__main__":
    main()
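To try it out, save the script as, say, test_minicpm3.py and run python test_minicpm3.py on a CUDA-capable machine. Note that the embedding and reranker loaders request attn_implementation="flash_attention_2", which additionally requires the flash-attn package; if it is unavailable, removing that argument should fall back to the default attention implementation.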
To be continued...
For more details, follow: 杰哥新技能.