Source: https://milvus.io/docs/zh/full_text_search_with_milvus.md
full_text_demo:
from typing import List
from __init__ import openai_client
import sys
from pymilvus import (
    MilvusClient,
    DataType,
    Function,
    FunctionType,
    AnnSearchRequest,
    RRFRanker,
)
# Connect to Milvus
uri = "http://ip:19530"
collection_name = "full_text_demo"
client = MilvusClient(uri=uri)
print("连接成功")
# sys.exit()
analyzer_params = {"tokenizer": "standard", "filter": ["lowercase"]}
schema = MilvusClient.create_schema()
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    auto_id=True,
    max_length=100,
)
schema.add_field(
    field_name="content",
    datatype=DataType.VARCHAR,
    max_length=65535,
    analyzer_params=analyzer_params,
    enable_match=True,  # Enable text matching
    enable_analyzer=True,  # Enable text analysis
)
schema.add_field(field_name="sparse_vector", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(
    field_name="dense_vector",
    datatype=DataType.FLOAT_VECTOR,
    dim=1536,  # Dimension for text-embedding-3-small
)
schema.add_field(field_name="metadata", datatype=DataType.JSON)
bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["content"],
    output_field_names="sparse_vector",
)
schema.add_function(bm25_function)
# Define indexes
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="sparse_vector",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
)
index_params.add_index(field_name="dense_vector", index_type="FLAT", metric_type="IP")
# Drop the collection if it already exists
if client.has_collection(collection_name):
    client.drop_collection(collection_name)
# Create the collection
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
)
print(f"Collection '{collection_name}' created successfully")
# openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
model_name = "text-embedding-3-small"
def get_embeddings(texts: List[str]) -> List[List[float]]:
if not texts:
return []
response = openai_client.embeddings.create(input=texts, model=model_name)
return [embedding.embedding for embedding in response.data]
# Example documents to insert
documents = [
    {
        "content": "Milvus is a vector database built for embedding similarity search and AI applications.",
        "metadata": {"source": "documentation", "topic": "introduction"},
    },
    {
        "content": "Full-text search in Milvus allows you to search using keywords and phrases.",
        "metadata": {"source": "tutorial", "topic": "full-text search"},
    },
    {
        "content": "Hybrid search combines the power of sparse BM25 retrieval with dense vector search.",
        "metadata": {"source": "blog", "topic": "hybrid search"},
    },
]
# Prepare entities for insertion
entities = []
texts = [doc["content"] for doc in documents]
embeddings = get_embeddings(texts)
for i, doc in enumerate(documents):
    entities.append(
        {
            "content": doc["content"],
            "dense_vector": embeddings[i],
            "metadata": doc.get("metadata", {}),
        }
    )
# Insert data
client.insert(collection_name, entities)
print(f"Inserted {len(entities)} documents")
# Example query for semantic search
query = "How does Milvus help with similarity search?"
# Generate embedding for query
query_embedding = get_embeddings([query])[0]
# Semantic search using dense vectors
results = client.search(
    collection_name=collection_name,
    data=[query_embedding],
    anns_field="dense_vector",
    limit=5,
    output_fields=["content", "metadata"],
)
dense_results = results[0]
# Print results
print("\nDense Search (Semantic):")
for i, result in enumerate(dense_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )
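
# The original snippet imports AnnSearchRequest and RRFRanker but never uses them.
# Below is a minimal sketch of how the demo typically continues with a full-text
# (sparse BM25) search and a hybrid search; the search parameters (limit=5, empty
# param dicts, default RRF settings) are illustrative assumptions, not part of the
# original snippet.

# Full-text search: pass the raw query text; the collection's BM25 function
# converts it into a sparse vector at query time.
results = client.search(
    collection_name=collection_name,
    data=[query],
    anns_field="sparse_vector",
    limit=5,
    output_fields=["content", "metadata"],
)
sparse_results = results[0]
print("\nSparse Search (Full-text):")
for i, result in enumerate(sparse_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )

# Hybrid search: run a sparse (BM25) request and a dense request, then fuse the
# two ranked lists with Reciprocal Rank Fusion (RRFRanker).
sparse_request = AnnSearchRequest(
    data=[query], anns_field="sparse_vector", param={}, limit=5
)
dense_request = AnnSearchRequest(
    data=[query_embedding], anns_field="dense_vector", param={}, limit=5
)
results = client.hybrid_search(
    collection_name,
    [sparse_request, dense_request],
    ranker=RRFRanker(),
    limit=5,
    output_fields=["content", "metadata"],
)
hybrid_results = results[0]
print("\nHybrid Search (RRF fusion):")
for i, result in enumerate(hybrid_results):
    print(
        f"{i+1}. Score: {result['distance']:.4f}, Content: {result['entity']['content']}"
    )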