Posted by tsx81428 on 2024-12-28 13:22:13

LLM: Deploying a local embedding model and GLM-4 with llama-index and building a basic RAG (other large models work similarly)

Preface
No time at the moment; this section will be filled in later.

llama-index overview

Official docs: https://docs.llamaindex.ai/en/stable/
No time for a proper overview either; it will be added later.
Note: the official API changes over time, so the code below may need to change as well. If something doesn't run, check the official docs.
Loading a local embedding model

If llama_index.embeddings.huggingface cannot be found, install it with: pip install llama-index-embeddings-huggingface
If that still does not work, go to the official docs and search for "huggingface".
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Path to a local embedding model directory (example; adjust to your environment)
embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

Settings.embed_model = HuggingFaceEmbedding(
    model_name=embed_model_path,
    device='cuda',
)
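A quick sanity check is to embed a short string directly; a minimal sketch, assuming the example model path above:

# Verify the embedding model loads and inspect the vector dimensionality
vector = Settings.embed_model.get_text_embedding("hello world")
print(len(vector))  # e.g. 1024 for bge-m3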

Loading a local LLM model

Again: if the code below doesn't work, go to the official docs and search for "Custom LLM Model".
from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM

class GLMCustomLLM(CustomLLM):
    context_window: int = 8192  # context window size
    num_output: int = 8000  # maximum number of tokens to generate
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None  # tokenizer
    model: object = None  # model
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()

        # Load the tokenizer and model on GPU
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True).eval()

        # To run on CPU instead, load the model like this and convert it to float32:
        # self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True).eval()
        # self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    # Placeholder implementation from the official CustomLLM example, kept for reference:
    # @llm_completion_callback()
    # def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
    #     return CompletionResponse(text=self.dummy_response)
    #
    # @llm_completion_callback()
    # def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
    #     response = ""
    #     for token in self.dummy_response:
    #         response += token
    #         yield CompletionResponse(text=response, delta=token)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        print("complete() called")

        # Tokenize the prompt and move it to the GPU (drop .cuda() when running on CPU)
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()
        # max_length counts prompt tokens plus generated tokens
        outputs = self.model.generate(inputs, max_length=self.num_output)
        # Decode the first (and only) sequence in the batch
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        print("stream_complete() called")

        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # drop .cuda() when running on CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Note: generation is not truly incremental here; the full text is produced
        # first and then yielded character by character.
        accumulated = ""
        for token in response:
            accumulated += token
            yield CompletionResponse(text=accumulated, delta=token)
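The class can be exercised on its own before wiring it into an index. A minimal sketch, reusing the GLM-4 path from the full script below (adjust it to your environment):

# Standalone smoke test of the custom LLM
llm = GLMCustomLLM('/home/nlp/model/LLM/THUDM/glm-4-9b-chat')
print(llm.metadata)
print(llm.complete("Briefly introduce llama-index.").text)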
Building a simple RAG with the local models


from typing import Any

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


class GLMCustomLLM(CustomLLM):
    context_window: int = 8192  # context window size
    num_output: int = 8000  # maximum number of tokens to generate
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None  # tokenizer
    model: object = None  # model
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        super().__init__()

        # Load the tokenizer and model on GPU
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cuda", trust_remote_code=True).eval()

        # To run on CPU instead, load the model like this and convert it to float32:
        # self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cpu", trust_remote_code=True).eval()
        # self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    # Placeholder implementation from the official CustomLLM example, kept for reference:
    # @llm_completion_callback()
    # def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
    #     return CompletionResponse(text=self.dummy_response)
    #
    # @llm_completion_callback()
    # def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
    #     response = ""
    #     for token in self.dummy_response:
    #         response += token
    #         yield CompletionResponse(text=response, delta=token)

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        print("complete() called")

        # Tokenize the prompt and move it to the GPU (drop .cuda() when running on CPU)
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()
        # max_length counts prompt tokens plus generated tokens
        outputs = self.model.generate(inputs, max_length=self.num_output)
        # Decode the first (and only) sequence in the batch
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        print("stream_complete() called")

        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # drop .cuda() when running on CPU
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Note: generation is not truly incremental here; the full text is produced
        # first and then yielded character by character.
        accumulated = ""
        for token in response:
            accumulated += token
            yield CompletionResponse(text=accumulated, delta=token)


if __name__ == "__main__":

    # Paths to the local LLM and embedding models
    pretrained_model_name_or_path = r'/home/nlp/model/LLM/THUDM/glm-4-9b-chat'
    embed_model_path = '/home/nlp/model/Embedding/BAAI/bge-m3'

    Settings.embed_model = HuggingFaceEmbedding(
        model_name=embed_model_path,
        device='cuda',
    )

    Settings.llm = GLMCustomLLM(pretrained_model_name_or_path)

    # Load documents from a local directory and build a vector index over them
    documents = SimpleDirectoryReader(input_dir="/home/xxxx/input").load_data()
    index = VectorStoreIndex.from_documents(
        documents,
    )

    # Query the index and print the result
    query_engine = index.as_query_engine()
    response = query_engine.query("萧炎的表妹是谁?")  # "Who is Xiao Yan's cousin?"

    print(response)
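Because stream_complete is implemented, the same index can also stream its answer instead of returning it all at once. A minimal sketch that can be appended inside the __main__ block above:

    # Optional: stream the answer token by token
    streaming_engine = index.as_query_engine(streaming=True)
    streaming_response = streaming_engine.query("萧炎的表妹是谁?")
    streaming_response.print_response_stream()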
Using Ollama

As an alternative to the custom LLM class, llama-index can use a model served by Ollama (install the integration with pip install llama-index-llms-ollama and make sure the Ollama server is running):


from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

documents = SimpleDirectoryReader("data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Ollama-served LLM (the llama3 model must be pulled first, e.g. with `ollama pull llama3`)
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

index = VectorStoreIndex.from_documents(
    documents,
)
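The Ollama snippet stops after building the index; querying works the same way as in the GLM-4 script. A minimal sketch (the question string is just a placeholder):

# Query the Ollama-backed index
query_engine = index.as_query_engine()
print(query_engine.query("What is this document about?"))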
Likes and bookmarks are welcome; they encourage the author to update faster.
The author has also set up a study group; everyone is welcome to join and learn together.
Get the QR code via this link:
Link: https://pan.baidu.com/s/1ZZZZ-ANJvMFHlanl-tbtew?pwd=9el1

Reference links:
CustomLLM in LlamaIndex (loading a local model)
llamaIndex: loading a local embedding model on GPU

Official documentation:
starter_example_local
usage_custom
