已在notebook测试无误。
包安装
pip install langchain langchain_community transformers InstructorEmbedding sentence_transformers==2.2.2 faiss-gpu PyPDF2 streamlit pyngrok gradio fitz frontend
环境变量设置
如果无法连接 Hugging Face 下载模型,需要设置镜像。
import os

# Point Hugging Face downloads at the mirror endpoint, for environments
# where huggingface.co itself cannot be reached.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Echo the value back to confirm the environment variable was updated.
print(os.getenv("HF_ENDPOINT"))
模型下载
# SECURITY: never paste a Hugging Face access token into a notebook cell.
# The token that used to be hard-coded here was published and should be revoked.
# If authentication is needed, export HF_TOKEN in the environment before
# starting the notebook; the CLI picks it up without a --token flag.
!huggingface-cli download --resume-download BAAI/bge-m3
!huggingface-cli download --resume-download baichuan-inc/Baichuan2-7B-Chat
重要代码
- # coding: utf-8
- # Author: 唐国梁Tommy
- # Date: 2023-08-06
- import streamlit as st
- from langchain_community.embeddings import HuggingFaceInstructEmbeddings
- from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
- from langchain.text_splitter import CharacterTextSplitter
- from langchain_community.vectorstores import FAISS, Milvus, Pinecone, Chroma
- from langchain.memory import ConversationBufferMemory
- from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline
- from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
- import streamlit as st
- from PyPDF2 import PdfReader
def main():
    """Render the PDF question-answering page and handle document submission.

    The main area shows a question box; any question typed there is passed to
    ``process_user_input``. The sidebar lets the user upload PDF files and,
    on submit, extracts their text, splits it into chunks, embeds the chunks
    into a vector store and stores the resulting conversation chain in
    ``st.session_state.conversation``.
    """
    # Page configuration.
    st.set_page_config(page_title="基于PDF文档的 QA ChatBot",
                       page_icon=":robot:")
    st.header("基于LangChain+LLM实现QA ChatBot")

    # Reference: https://github.com/hwchase17/langchain-streamlit-template/blob/master/main.py
    # st.session_state keeps values across Streamlit reruns; seed the keys
    # that the rest of the app reads.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    # 1. Question input box; answer as soon as the user has typed something.
    user_input = st.text_input("基于上传的PDF文档,请输入你的提问: ")
    if user_input:
        process_user_input(user_input)

    with st.sidebar:
        # 2. Sidebar sub-title.
        st.subheader("你的PDF文档")
        # 3. Document upload.
        files = st.file_uploader("上传PDF文档,然后点击'提交并处理'",
                                 accept_multiple_files=True)
        if st.button("提交并处理"):
            # Nothing uploaded: tell the user instead of running the
            # (expensive) pipeline on an empty input.
            if not files:
                st.warning("请先上传至少一个PDF文档。")
                return
            with st.spinner("请等待,处理中..."):
                # 4. Extract the raw text of the uploaded PDFs.
                texts = extract_text_from_PDF(files)
                # 5. Split the text into chunks.
                content_chunks = split_content_into_chunks(texts)
                # No extractable text (e.g. image-only PDFs): there is
                # nothing to index, so stop before loading any model.
                if not content_chunks:
                    st.warning("未能从上传的PDF文档中提取到文本。")
                    return
                # 6.1 Embedding model used to vectorise every chunk.
                embedding_model = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-m3")
                # 6.2 Embed the chunks and store them in the vector store.
                vector_store = save_chunks_into_vectorstore(content_chunks, embedding_model)
                # 7. Build the conversation chain and keep it for later reruns.
                # Reference: https://python.langchain.com/docs/modules/memory/types/buffer
                st.session_state.conversation = get_chat_chain(vector_store)
def extract_text_from_PDF(files):
    """Return the concatenated text of every page of every given PDF.

    Args:
        files: Iterable of PDF sources, each passed to ``PdfReader``.
            ``None`` or an empty iterable yields an empty string.

    Returns:
        A single string with the text of all pages, in file and page order.
    """
    # Reference: https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
    page_texts = []
    for pdf in files or ():
        for page in PdfReader(pdf).pages:
            # Guard against a page yielding no text object at all, so the
            # final join never sees a non-string.
            page_texts.append(page.extract_text() or "")
    # Join once at the end instead of growing a string with repeated `+=`.
    return "".join(page_texts)
def split_content_into_chunks(text):
    """Split ``text`` into overlapping chunks with a ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of 500,
    an overlap of 80, and ``len`` as the length function.

    Args:
        text: The full document text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    # Reference: https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/character_text_splitter
    splitter_options = {
        "separator": "\n",
        "chunk_size": 500,
        "chunk_overlap": 80,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def save_chunks_into_vectorstore(content_chunks, embedding_model):
    """Build a FAISS vector store from text chunks.

    Args:
        content_chunks: The text chunks to index.
        embedding_model: Embedding object handed to ``FAISS.from_texts``.

    Returns:
        The vector store created by ``FAISS.from_texts``.
    """
    # Reference: https://python.langchain.com/docs/modules/data_connection/vectorstores/
    # FAISS backend: pip install faiss-gpu (or faiss-cpu when no GPU is available).
    return FAISS.from_texts(texts=content_chunks, embedding=embedding_model)
def get_chat_chain(vector_store):
    """Build a conversational retrieval chain backed by Baichuan2-7B-Chat.

    Args:
        vector_store: Vector store whose ``as_retriever`` supplies the
            documents used to answer questions.

    Returns:
        A ``ConversationalRetrievalChain`` wired to the local LLM, a
        similarity retriever returning 5 results, and a buffer memory
        stored under the ``chat_history`` key.
    """
    # 1. Load the LLM and wrap it as a LangChain LLM.
    model_path = "baichuan-inc/Baichuan2-7B-Chat"
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    # The model is loaded with trust_remote_code=True because the repository
    # ships custom code; the tokenizer comes from the same repository and
    # needs the same opt-in, otherwise loading cannot proceed unattended.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    llm = HuggingFacePipeline(pipeline=pipe)

    # 2. Memory object that accumulates the conversation history.
    # Reference: https://python.langchain.com/docs/use_cases/question_answering/how_to/chat_vector_db
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)

    # 3. Conversation chain: retrieve the 5 most similar chunks per question.
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
    )
def process_user_input(user_input):
    """Answer ``user_input`` with the stored conversation chain and render the chat.

    Args:
        user_input: The question typed by the user.

    When no conversation chain has been built yet, a warning is shown and
    nothing else happens. Otherwise the chain is called with the question,
    the returned ``chat_history`` is saved in ``st.session_state`` and every
    message is rendered, alternating user / assistant by position.
    """
    print('输入内容 ' + user_input)
    conversation = st.session_state.conversation
    if conversation is None:
        # No documents processed yet: tell the user instead of silently
        # ignoring the question.
        st.warning("请先上传PDF文档并点击'提交并处理'。")
        return

    print('不为空')
    # Pass the user's text to the chain as the question.
    response = conversation({'question': user_input})
    # The chain result is not a string (it is indexed by key below), so it
    # must be formatted rather than concatenated with `+`.
    print(f'response {response}')
    # Persist the history so it survives Streamlit reruns.
    st.session_state.chat_history = response['chat_history']

    # Render the history: even positions are user messages, odd positions
    # are bot replies. Chat-message containers are used because the
    # `user_template` / `bot_template` HTML templates are not defined in this
    # file; this also avoids rendering message content as raw HTML.
    for i, message in enumerate(st.session_state.chat_history):
        role = "user" if i % 2 == 0 else "assistant"
        with st.chat_message(role):
            st.write(message.content)
# Script entry point: run the Streamlit page when executed directly.
if __name__ == "__main__":
    main()
内网穿透
import os

from pyngrok import ngrok

# SECURITY: the ngrok auth token is a secret and must not be hard-coded in a
# shared notebook. Read it from the NGROK_AUTHTOKEN environment variable; a
# missing variable raises KeyError instead of silently running unauthenticated.
# The token previously embedded here was published and should be revoked.
ngrok.set_auth_token(os.environ["NGROK_AUTHTOKEN"])
# Forward local port 8501 (where the Streamlit app is served) to a public URL.
public_url = ngrok.connect(addr="8501", proto="http")
tunnels = ngrok.get_tunnels()
# Show the active tunnels, which include the public URL.
print("tunnels:", tunnels)
测试运行
!streamlit run /mnt/workspace/main.py
requirement.txt
Package Version
------------------------------ --------------------
absl-py 2.0.0
accelerate 0.29.3
adaseq 0.6.6
addict 2.4.0
aiofiles 23.2.1
aiohttp 3.9.5
aiosignal 1.3.1
alabaster 0.7.13
albumentations 1.3.1
alias-free-torch 0.0.6
aliyun-python-sdk-core 2.15.1
aliyun-python-sdk-kms 2.16.2
altair 5.3.0
aniso8601 9.0.1
annotated-types 0.6.0
antlr4-python3-runtime 4.9.3
anyio 4.3.0
appdirs 1.4.4
argon2-cffi 23.1.0
argon2-cffi-bindings 21.2.0
arrow 1.3.0
asttokens 2.4.1
astunparse 1.6.3
async-lru 2.0.4
async-timeout 4.0.3
attrs 23.2.0
audioread 3.0.1
autoawq 0.2.4
autoawq_kernels 0.0.6
autojump 0.1.0
autopep8 2.0.4
av 12.0.0
Babel 2.14.0
beartype 0.18.5
beautifulsoup4 4.12.3
bidict 0.23.1
biopython 1.83
bitarray 2.9.2
bitstring 4.2.0
black 24.4.0
bleach 6.1.0
blinker 1.8.2
blis 0.7.11
blobfile 2.1.1
bmt-clipit 1.0
boltons 23.0.0
boto3 1.34.88
botocore 1.34.88
brotlipy 0.7.0
cachetools 5.3.2
catalogue 2.0.10
certifi 2023.11.17
cffi 1.15.1
cfgv 3.4.0
charset-normalizer 2.0.4
chumpy 0.70
ci-info 0.3.0
cityscapesScripts 2.2.3
click 8.1.7
clip 1.0
cloudpathlib 0.16.0
cloudpickle 3.0.0
colorama 0.4.6
coloredlogs 14.0
comm 0.2.1
conda 23.9.0
conda-content-trust 0.2.0
conda-libmamba-solver 23.9.1
conda-package-handling 2.2.0
conda_package_streaming 0.9.0
confection 0.1.4
ConfigArgParse 1.7
configobj 5.0.8
configparser 7.0.0
contextlib2 21.6.0
contourpy 1.2.0
control-ldm 0.0.1
crcmod 1.7
cryptography 41.0.3
cycler 0.12.1
cymem 2.0.8
Cython 0.29.36
dacite 1.8.1
dataclasses 0.6
dataclasses-json 0.6.6
datasets 2.18.0
ddpm-guided-diffusion 0.0.0
debugpy 1.8.0
decorator 4.4.2
decord 0.6.0
deepspeed 0.12.6
defusedxml 0.7.1
descartes 1.1.0
dgl 1.1.3
dglgo 0.0.2
diffusers 0.27.2
dill 0.3.8
Distance 0.1.3
distlib 0.3.8
dnspython 2.3.0
docstring_parser 0.16
docutils 0.20.1
easydict 1.13
easyrobust 0.2.4
edit-distance 1.0.6
editdistance 0.5.2
einops 0.7.0
embeddings 0.0.8
emoji 2.11.1
espnet-tts-frontend 0.0.3
et-xmlfile 1.1.0
etelemetry 0.3.1
eventlet 0.36.1
exceptiongroup 1.2.0
executing 2.0.1
expecttest 0.2.1
face-alignment 1.4.1
fairscale 0.4.13
fairseq 0.12.2
faiss-gpu 1.7.2
fastai 2.7.14
fastapi 0.110.2
fastcore 1.5.29
fastdownload 0.0.7
fastjsonschema 2.19.1
fastprogress 1.0.3
fasttext 0.9.2
ffmpeg 1.4
ffmpeg-python 0.2.0
ffmpy 0.3.2
filelock 3.13.1
fire 0.6.0
fitz 0.0.1.dev2
flake8 7.0.0
Flask 2.2.5
Flask-Cors 4.0.0
Flask-RESTful 0.3.10
Flask-SocketIO 5.3.6
flask-talisman 1.1.0
flatbuffers 23.5.26
fonttools 4.47.0
fqdn 1.5.1
frontend 0.0.3
frozenlist 1.4.1
fsspec 2023.12.2
ftfy 6.2.0
funasr 1.0.14
funcodec 0.2.0
funtextprocessing 0.1.1
future 1.0.0
fvcore 0.1.5.post20221221
g2p 2.0.0
g2p-en 2.1.0
gast 0.5.4
gitdb 4.0.11
GitPython 3.1.43
google-auth 2.26.1
google-auth-oauthlib 1.0.0
google-pasta 0.2.0
gradio 4.32.2
gradio_client 0.17.0
greenlet 3.0.3
grpcio 1.60.0
h11 0.14.0
h5py 3.10.0
hdbscan 0.8.33
hjson 3.1.0
httpcore 1.0.5
httplib2 0.22.0
httpx 0.27.0
huggingface-hub 0.22.2
humanfriendly 10.0
hydra-core 1.3.2
HyperPyYAML 1.2.2
identify 2.5.36
idna 3.4
imageio 2.34.1
imageio-ffmpeg 0.4.9
imagesize 1.4.1
imgaug 0.4.0
importlib-metadata 7.0.1
importlib_resources 6.4.0
inflect 7.0.0
iniconfig 2.0.0
InstructorEmbedding 1.0.1
iopath 0.1.10
ipdb 0.13.13
ipykernel 6.28.0
ipython 8.19.0
isodate 0.6.1
isoduration 20.11.0
isort 5.13.2
itsdangerous 2.2.0
jaconv 0.3.4
jamo 0.4.1
jedi 0.19.1
jieba 0.42.1
Jinja2 3.1.2
jmespath 0.10.0
joblib 1.3.2
json-tricks 3.17.3
json5 0.9.25
jsonpatch 1.33
jsonplus 0.8.0
jsonpointer 2.1
jsonschema 4.21.1
jsonschema-specifications 2023.12.1
jupyter_client 8.6.0
jupyter_core 5.7.0
jupyter-events 0.10.0
jupyter-lsp 2.2.5
jupyter_server 2.14.0
jupyter_server_terminals 0.5.3
jupyterlab 4.1.6
jupyterlab-language-pack-zh-CN 4.1.post2
jupyterlab_pygments 0.3.0
jupyterlab_server 2.27.1
kaldiio 2.18.0
kantts 1.0.1
keras 2.14.0
kiwisolver 1.4.5
kornia 0.7.2
kornia_rs 0.1.3
kwsbp 0.0.6
langchain 0.2.1
langchain-community 0.2.1
langchain-core 0.2.3
langchain-text-splitters 0.2.0
langcodes 3.3.0
langsmith 0.1.67
lap 0.4.0
lazy_loader 0.4
libclang 16.0.6
libmambapy 1.5.1
librosa 0.10.1
lightning-utilities 0.11.2
littleutils 0.2.2
llvmlite 0.41.1
lmdb 1.4.1
local-attention 1.9.1
looseversion 1.3.0
lpips 0.1.4
lxml 4.9.4
lyft-dataset-sdk 0.0.8
Markdown 3.5.1
markdown-it-py 3.0.0
MarkupSafe 2.1.3
marshmallow 3.21.2
matplotlib 3.5.3
matplotlib-inline 0.1.6
mccabe 0.7.0
mdurl 0.1.2
megatron-util 1.3.2
MinDAEC 0.0.2
mir-eval 0.7
mistune 3.0.2
ml-collections 0.1.1
ml-dtypes 0.2.0
mmcls 0.25.0
mmcv-full 1.7.0+torch2.1cpu
mmdet 2.28.2
mmdet3d 1.0.0a1
mmsegmentation 0.30.0
mock 5.1.0
modelscope 1.14.0
modelscope_kit 0.3.0
more-itertools 10.2.0
moviepy 1.0.3
mpi4py 3.1.5
mpmath 1.3.0
ms-swift 2.0.2
msgpack 1.0.8
multidict 6.0.5
multiprocess 0.70.16
munkres 1.1.4
murmurhash 1.0.10
mypy-extensions 1.0.0
nbclient 0.10.0
nbconvert 7.16.3
nbformat 5.10.4
nerfacc 0.2.2
nest-asyncio 1.5.8
networkx 3.2.1
nibabel 5.2.1
ninja 1.11.1.1
nipype 1.8.6
nltk 3.8.1
nodeenv 1.8.0
notebook_shim 0.2.4
numba 0.58.1
numpy 1.26.3
numpydoc 1.6.0
nuscenes-devkit 1.1.11
oauthlib 3.2.2
ogb 1.3.6
omegaconf 2.3.0
onnx 1.16.0
onnxruntime 1.17.3
onnxsim 0.4.36
open-clip-torch 2.24.0
openai-whisper 20231117
opencv-python 4.9.0.80
opencv-python-headless 4.9.0.80
openpyxl 3.1.2
opt-einsum 3.3.0
optimum 1.19.0
orjson 3.10.3
oss2 2.18.4
outdated 0.2.2
overrides 7.7.0
packaging 23.2
pai-easycv 0.11.6
paint-ldm 0.0.0
pandas 2.1.4
pandocfilters 1.5.1
panopticapi 0.1
panphon 0.20.0
parso 0.8.3
pathlib 1.0.1
pathspec 0.12.1
peft 0.10.0
pexpect 4.9.0
phaseaug 1.0.1
pickleshare 0.7.5
pillow 10.2.0
pip 24.0
platformdirs 4.1.0
plotly 5.21.0
pluggy 1.5.0
plyfile 1.0.3
pooch 1.8.0
portalocker 2.8.2
pre-commit 3.7.0
preshed 3.0.9
prettytable 3.10.0
proglog 0.1.10
prometheus_client 0.20.0
prompt-toolkit 3.0.43
protobuf 3.20.3
prov 2.0.0
psutil 5.9.7
ptflops 0.7.2.2
ptyprocess 0.7.0
pure-eval 0.2.2
py-cpuinfo 9.0.0
py-sound-connect 0.2.1
pyarrow 16.0.0
pyarrow-hotfix 0.6
pyasn1 0.5.1
pyasn1-modules 0.3.0
pybind11 2.11.1
pyclipper 1.3.0.post5
pycocoevalcap 1.2
pycocotools 2.0.7
pycodestyle 2.11.1
pycosat 0.6.6
pycparser 2.21
pycryptodome 3.20.0
pycryptodomex 3.20.0
pydantic 2.5.3
pydantic_core 2.14.6
pydeck 0.9.1
pyDeprecate 0.3.2
pydot 2.0.0
pydub 0.25.1
pyflakes 3.2.0
Pygments 2.17.2
PyMCubes 0.1.4
pyngrok 7.1.6
pynini 2.1.5
pynndescent 0.5.12
pynvml 11.5.0
pyOpenSSL 23.2.0
pyparsing 3.1.1
PyPDF2 3.0.1
pypinyin 0.44.0
pyquaternion 0.9.9
PySocks 1.7.1
pysptk 0.1.18
pytest 8.1.1
pythainlp 5.0.2
python-crfsuite 0.9.10
python-dateutil 2.8.2
python-engineio 4.9.0
python-json-logger 2.0.7
python-multipart 0.0.9
python-socketio 5.11.2
pytorch-lightning 1.7.7
pytorch-metric-learning 2.5.0
pytorch-wavelets 1.3.0
pytorch-wpe 0.0.1
pytz 2023.3.post1
pyvi 0.1.1
PyWavelets 1.6.0
pyxnat 1.6.2
PyYAML 6.0.1
pyzmq 25.1.2
qudida 0.0.4
rapidfuzz 3.8.1
rdflib 7.0.0
rdkit-pypi 2022.9.5
referencing 0.35.0
regex 2023.12.25
requests 2.31.0
requests-oauthlib 1.3.1
resampy 0.4.2
rfc3339-validator 0.1.4
rfc3986-validator 0.1.1
rich 13.7.1
rotary-embedding-torch 0.5.3
rouge 1.0.1
rouge-score 0.0.4
rpds-py 0.18.0
rsa 4.9
ruamel.yaml 0.18.6
ruamel.yaml.clib 0.2.8
ruff 0.4.7
s3transfer 0.10.1
sacrebleu 2.4.0
sacremoses 0.1.1
safetensors 0.4.1
scikit-image 0.19.3
scikit-learn 1.3.2
scipy 1.11.4
seaborn 0.13.2
semantic-version 2.10.0
Send2Trash 1.8.3
sentence-transformers 2.2.2
sentencepiece 0.2.0
seqeval 1.2.2
setuptools 68.0.0
Shapely 1.8.4
shellingham 1.5.4
shotdetect-scenedetect-lgss 0.0.4
shtab 1.7.1
simple-websocket 1.0.0
simplejson 3.19.2
six 1.16.0
sklearn-crfsuite 0.3.6
smart-open 6.4.0
smmap 5.0.1
smplx 0.1.28
sniffio 1.3.1
snowballstemmer 2.2.0
sortedcontainers 2.4.0
soundfile 0.12.1
soupsieve 2.5
sox 1.5.0
soxr 0.3.7
spacy 3.7.4
spacy-legacy 3.0.12
spacy-loggers 1.0.5
speechbrain 1.0.0
Sphinx 7.2.6
sphinxcontrib-applehelp 1.0.7
sphinxcontrib-devhelp 1.0.5
sphinxcontrib-htmlhelp 2.0.4
sphinxcontrib-jsmath 1.0.1
sphinxcontrib-qthelp 1.0.6
sphinxcontrib-serializinghtml 1.1.9
SQLAlchemy 2.0.30
srsly 2.4.8
sse-starlette 2.1.0
stack-data 0.6.3
stanza 1.8.2
starlette 0.37.2
streamlit 1.35.0
subword-nmt 0.3.8
sympy 1.12
tabulate 0.9.0
taming-transformers-rom1504 0.0.6
tenacity 8.2.3
tensorboard 2.16.2
tensorboard-data-server 0.7.2
tensorboardX 2.6.2.2
tensorflow 2.14.0
tensorflow-estimator 2.14.0
tensorflow-io-gcs-filesystem 0.35.0
termcolor 2.4.0
terminado 0.18.1
terminaltables 3.1.10
text-unidecode 1.3
text2sql-lgesql 1.3.0
tf-slim 1.1.0
thinc 8.2.3
thop 0.1.1.post2209072238
threadpoolctl 3.2.0
tifffile 2024.4.18
tiktoken 0.6.0
timm 0.9.16
tinycss2 1.3.0
tokenizers 0.15.2
toml 0.10.2
tomli 2.0.1
tomlkit 0.12.0
toolz 0.12.1
torch 2.1.2+cpu
torch-complex 0.4.3
torch-scatter 2.1.2
torchaudio 2.1.2+cpu
torchmetrics 1.3.2
torchsummary 1.5.1
torchvision 0.16.2+cpu
tornado 6.4
tqdm 4.65.0
traitlets 5.14.1
traits 6.3.2
transformers 4.38.2
transformers-stream-generator 0.0.5
trimesh 2.35.39
triton 2.3.0
trl 0.8.5
truststore 0.8.0
ttsfrd 0.2.1
typeguard 2.13.3
typer 0.12.3
types-python-dateutil 2.9.0.20240316
typing 3.7.4.3
typing_extensions 4.9.0
typing-inspect 0.9.0
tyro 0.8.3
tzdata 2023.4
ujson 5.9.0
umap-learn 0.5.6
unicodecsv 0.14.1
unicodedata2 15.1.0
Unidecode 1.3.8
uri-template 1.3.0
urllib3 2.2.1
utils 1.0.2
uvicorn 0.29.0
videofeatures-clipit 1.0
virtualenv 20.25.3
wasabi 1.1.2
watchdog 4.0.1
wcwidth 0.2.12
weasel 0.3.4
webcolors 1.13
webencodings 0.5.1
websocket-client 1.8.0
websockets 11.0.3
Werkzeug 3.0.1
wget 3.2
wheel 0.41.2
wrapt 1.14.1
wsproto 1.2.0
xtcocotools 1.14
xxhash 3.4.1
yacs 0.1.8
yapf 0.30.0
yarl 1.9.4
zhconv 1.4.3
zipp 3.17.0
zstandard 0.19.0
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息请访问主页:qidao123.com(ToB企服之家,中国第一个企服评测及商务社交产业平台)。