In the code above, we add two documents to Chroma (two plain strings, for simplicity): "This is a document about engineer" and "This is a document about steak". Since no embeddings argument is passed to the add method, Chroma embeds them with its default all-MiniLM-L6-v2 model. We then query the collection for the two most relevant results to the question: Which food is the best?
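For reference, a minimal sketch of what that code looks like, reconstructed from the result below (the collection name "test" matches the one used later in this section; the ids and metadata follow the output):

import chromadb

client = chromadb.Client()
collection = client.create_collection(name="test")

# No embeddings argument here, so Chroma embeds the documents with its
# default all-MiniLM-L6-v2 model.
collection.add(
    documents=["This is a document about engineer",
               "This is a document about steak"],
    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
    ids=["id1", "id2"],
)

results = collection.query(
    query_texts=["Which food is the best?"],
    n_results=2,  # return the two most relevant matches
)
print(results)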
The query returns:
{
    'ids': [['id2', 'id1']],
    'distances': [[1.5835548639297485, 2.1740970611572266]],
    'metadatas': [[{'source': 'doc2'}, {'source': 'doc1'}]],
    'embeddings': None,
    'documents': [['This is a document about steak', 'This is a document about engineer']]
}
The steak document ranks first because its distance to the query is smaller. Beyond create_collection, Chroma offers several other APIs for managing collections:
collection = client.get_collection(name="test") # Get a collection object from an existing collection, by name. Will raise an exception if it's not found.
collection = client.get_or_create_collection(name="test") # Get a collection object from an existing collection, by name. If it doesn't exist, create it.
client.delete_collection(name="my_collection") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible
collection.peek() # returns a list of the first 10 items in the collection
collection.count() # returns the number of items in the collection
collection.modify(name="new_name") # Rename the collection
A collection can also carry its own metadata, passed in at creation time:
collection = client.create_collection(
    name="collection_name",
    metadata={"hnsw:space": "cosine"}  # l2 is the default
)
The following operators are supported in where metadata filters:
$eq - equal to (string, int, float)
$ne - not equal to (string, int, float)
$gt - greater than (int, float)
$gte - greater than or equal to (int, float)
$lt - less than (int, float)
$lte - less than or equal to (int, float)
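As an illustration, here is a minimal sketch of one of these operators in a query's where filter, run against the two-document collection built above (the filter values are just examples):

# Restrict the search to items whose metadata field "source" equals "doc2".
results = collection.query(
    query_texts=["Which food is the best?"],
    n_results=2,
    where={"source": {"$eq": "doc2"}},
)

# Numeric comparisons follow the same shape; "year" is a hypothetical field:
# where={"year": {"$gte": 2020}}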
Besides the default, you can compute embeddings yourself with any sentence-transformers model:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('model_name')  # e.g. 'all-MiniLM-L6-v2'
The all-* models were trained on all available training data (more than 1 billion training pairs) and are designed as general purpose models. The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality. Visit the HuggingFace Model Hub to view all existing sentence-transformers models.
There are many models to choose from; the official page lists details for every pretrained model:
https://www.sbert.net/docs/pretrained_models.html
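To tie this back to Chroma, a minimal sketch of feeding such a model's vectors into the collection (the model choice here mirrors Chroma's default and is only an example; passing precomputed embeddings makes Chroma skip its own embedding step):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
docs = ["This is a document about engineer", "This is a document about steak"]
vectors = model.encode(docs).tolist()  # one float vector per document

collection.add(
    documents=docs,
    embeddings=vectors,  # precomputed, so the default embedder is bypassed
    ids=["id3", "id4"],  # new ids to avoid colliding with the earlier inserts
)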