Preface
Too busy these days; I'll fill this in later.
llama-index overview
Official docs: https://docs.llamaindex.ai/en/stable/
No time for the overview either; I'll write it later.
Note: the official library keeps changing, so this code may change with it. If something doesn't run for you, check the official docs.
Loading a local embedding model
If llama_index.embeddings.huggingface cannot be found, install it with:
pip install llama-index-embeddings-huggingface
If that still doesn't work, go to the official docs and search for "huggingface".
```python
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# embed_model_path is the local path to the embedding model
Settings.embed_model = HuggingFaceEmbedding(
    model_name=embed_model_path, device="cuda"
)
```
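To confirm the embedding model actually loads and produces vectors, a quick sanity check helps. This is a minimal sketch of my own, assuming `embed_model_path` points at a local model such as bge-m3 (whose vectors are 1024-dimensional):

```python
# Minimal sanity check (sketch): embed a short string and inspect the vector size.
emb = Settings.embed_model.get_text_embedding("hello world")
print(len(emb))  # 1024 for bge-m3
```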
Loading a local LLM
Again, if the code below does not work for you, search the official docs for "Custom LLM Model".
```python
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM


class GLMCustomLLM(CustomLLM):
    context_window: int = 8192          # context window size
    num_output: int = 8000              # maximum number of output tokens
    model_name: str = "glm-4-9b-chat"   # model name
    tokenizer: object = None            # tokenizer
    model: object = None                # model
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        # Load the model on GPU
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True).eval()
        # Load the model on CPU
        # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        # self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        self.model = self.model.float()  # cast to float32 (mainly relevant for CPU inference)

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    # Dummy implementation from the official Custom LLM example, kept for reference:
    # @llm_completion_callback()
    # def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
    #     return CompletionResponse(text=self.dummy_response)
    #
    # @llm_completion_callback()
    # def stream_complete(
    #     self, prompt: str, **kwargs: Any
    # ) -> CompletionResponseGen:
    #     response = ""
    #     for token in self.dummy_response:
    #         response += token
    #         yield CompletionResponse(text=response, delta=token)

    @llm_completion_callback()  # callback decorator
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # Non-streaming completion
        print("complete() called")
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors="pt")  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        # Streaming completion (simplified: yields the decoded text character by character)
        print("stream_complete() called")
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors="pt")  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        for token in response:
            yield CompletionResponse(text=token, delta=token)
```
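Before wiring this class into an index, you can call it directly to make sure generation works. The snippet below is just a small sketch (the model path is a placeholder; replace it with your own local checkpoint):

```python
# Quick smoke test of GLMCustomLLM (sketch; the path below is a placeholder).
llm = GLMCustomLLM("/path/to/glm-4-9b-chat")

# Non-streaming completion
print(llm.complete("Hello, who are you?").text)

# Streaming completion: print chunks as they are yielded
for chunk in llm.stream_complete("Hello, who are you?"):
    print(chunk.delta, end="")
```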
Building a simple RAG pipeline with local models
```python
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


class GLMCustomLLM(CustomLLM):
    context_window: int = 8192          # context window size
    num_output: int = 8000              # maximum number of output tokens
    model_name: str = "glm-4-9b-chat"   # model name
    tokenizer: object = None            # tokenizer
    model: object = None                # model
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        # Load the model on GPU
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True).eval()
        # Load the model on CPU
        # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        # self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True)
        self.model = self.model.float()  # cast to float32 (mainly relevant for CPU inference)

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    # Dummy implementation from the official Custom LLM example, kept for reference:
    # @llm_completion_callback()
    # def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
    #     return CompletionResponse(text=self.dummy_response)
    #
    # @llm_completion_callback()
    # def stream_complete(
    #     self, prompt: str, **kwargs: Any
    # ) -> CompletionResponseGen:
    #     response = ""
    #     for token in self.dummy_response:
    #         response += token
    #         yield CompletionResponse(text=response, delta=token)

    @llm_completion_callback()  # callback decorator
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        # Non-streaming completion
        print("complete() called")
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors="pt")  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        # Streaming completion (simplified: yields the decoded text character by character)
        print("stream_complete() called")
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").cuda()  # GPU
        # inputs = self.tokenizer.encode(prompt, return_tensors="pt")  # CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        for token in response:
            yield CompletionResponse(text=token, delta=token)


if __name__ == "__main__":
    # Configure the local LLM and embedding model
    pretrained_model_name_or_path = r"/home/nlp/model/LLM/THUDM/glm-4-9b-chat"
    embed_model_path = "/home/nlp/model/Embedding/BAAI/bge-m3"
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=embed_model_path, device="cuda"
    )
    Settings.llm = GLMCustomLLM(pretrained_model_name_or_path)

    # Load documents and build the vector index
    documents = SimpleDirectoryReader(input_dir="/home/xxxx/input").load_data()
    index = VectorStoreIndex.from_documents(
        documents,
    )

    # Query and print the result
    query_engine = index.as_query_engine()
    response = query_engine.query("萧炎的表妹是谁?")  # "Who is Xiao Yan's cousin?"
    print(response)
```
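Building the index re-embeds every document on each run. One common follow-up, not covered above but sketched here assuming a ./storage directory, is to persist the index to disk and reload it later:

```python
# Sketch: persist the vector index and reload it in a later run
# (the ./storage directory name is an assumption).
from llama_index.core import StorageContext, load_index_from_storage

index.storage_context.persist(persist_dir="./storage")

# Later run: rebuild the index from disk instead of re-embedding the documents
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
```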
Using Ollama
If you serve the model through Ollama instead of loading it with transformers, the setup is shorter:
```python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

documents = SimpleDirectoryReader("data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Ollama-served LLM
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

index = VectorStoreIndex.from_documents(
    documents,
)
```
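The snippet above only builds the index; querying works the same way as in the previous example. A minimal usage sketch (the question text is just a placeholder):

```python
# Sketch: query the index built with the Ollama-backed LLM (question is a placeholder).
query_engine = index.as_query_engine()
response = query_engine.query("What is this document about?")
print(response)
```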
Likes and bookmarks are welcome.
They encourage the author to update faster~
The author has also set up a study group; everyone is welcome to join and learn together.
The QR code is available via the link below:
Link: https://pan.baidu.com/s/1ZZZZ-ANJvMFHlanl-tbtew?pwd=9el1
Reference links:
CustomLLM in LlamaIndex (loading a local model)
llamaIndex: loading a local embedding model on GPU
Official documentation
Official docs: starter_example_loca
Official docs: usage_custom