1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
| import weaviate from weaviate.classes.config import Property, DataType from weaviate.classes.config import Configure import json import random import numpy as np
def generate_mock_vector(text: str, dimension: int = 384) -> list:
    """Generate a deterministic, unit-length mock embedding (for learning demos).

    The vector carries no semantic meaning: it is pseudo-random noise seeded
    from ``text``, so identical texts always map to the same vector and
    different texts map to (almost surely) different ones.

    Args:
        text: Text to derive the vector from.
        dimension: Number of components in the returned vector.

    Returns:
        A list of ``dimension`` floats with Euclidean norm 1 (an empty list
        when ``dimension`` is 0).
    """
    # Seed a private generator with the string itself. Unlike ``hash(text)``,
    # string seeding is not affected by per-process hash randomization, so the
    # vectors are reproducible across runs; and unlike ``random.seed(...)`` it
    # leaves the global ``random`` state of the caller untouched.
    # ``str(...)`` keeps a ``None`` argument from falling back to OS entropy.
    rng = random.Random(str(text))
    vector = [rng.uniform(-1, 1) for _ in range(dimension)]

    norm = float(np.linalg.norm(vector))
    if norm == 0.0:
        # Empty (or, theoretically, all-zero) vector: nothing to normalise.
        return vector
    return [component / norm for component in vector]
# Demo: create an "Article" collection that stores caller-supplied vectors,
# index a few documents with mock vectors, then run vector, keyword and
# manual-similarity searches against it.
with weaviate.connect_to_local() as client:
    if client.is_ready():
        print("成功连接到本地Weaviate!")

    # Drop any previous copy so re-running the script starts from scratch.
    if client.collections.exists("Article"):
        client.collections.delete("Article")

    try:
        articles = client.collections.create(
            name="Article",
            properties=[
                Property(name="title", data_type=DataType.TEXT),
                Property(name="content", data_type=DataType.TEXT),
            ],
            # NOTE(review): confirm this spelling against the installed
            # weaviate-client version (newer docs show
            # `Configure.Vectors.self_provided()`).
            vector_config=Configure.VectorConfig.self_provided(),
        )
        print(f"成功创建Collection: {articles.name}")
        print("注意:此Collection使用自带向量模式,不会自动向量化文本")
    except Exception as e:
        # Best-effort: report the failure and carry on with the demo.
        print(f"创建Collection时出错: {e}")

    data = [
        {
            "title": "成都世运会烟花秀",
            "content": "成都世运会烟花秀于2025年8月7日晚上8点在秦皇湖举行.",
        },
        {
            "title": "成都世运会官方APP",
            "content": "作为2025年第12届世界运动会的官方移动应用,'世运通'(Chengdu2025)于近日全新上线,这座城市的热情与世运赛事的精彩,都将浓缩在这一方小小的屏幕里,为全球用户铺就一条通往成都世运会的便捷之路。作为赛会官方移动服务应用平台,世运通以运动员为中心、以赛事为核心,为赛会的组织和运行提供一体化移动整合服务,集成了门票预订、新闻资讯、世运指南等20余项功能。",
        },
        {
            "title": "Weaviate 介绍",
            "content": "Weaviate是一个使用Go语言从头构建的开源向量数据库",
        },
    ]

    articles_collection = client.collections.get("Article")

    # Each object is stored together with a mock vector derived from its
    # title and content.
    for item in data:
        combined_text = item["title"] + " " + item["content"]
        vector = generate_mock_vector(combined_text, dimension=384)
        articles_collection.data.insert(properties=item, vector=vector)

    print(f"{len(data)}篇文章已成功索引(使用自带向量模式)!")

    print("\n" + "=" * 50)
    print("自带向量模式下的搜索示例:")
    print("=" * 50)

    # --- Vector search: the query text is turned into a mock vector too. ---
    print("\n--- 向量搜索 ---")
    query_text = "世运会 APP"
    query_vector = generate_mock_vector(query_text, dimension=384)

    try:
        vector_result = articles_collection.query.near_vector(
            near_vector=query_vector,
            limit=2,
            return_metadata=["distance"],
        )

        print(f"查询: '{query_text}'")
        for item in vector_result.objects:
            print(json.dumps({
                "title": item.properties["title"],
                "content": item.properties["content"][:100] + "...",
                "距离": round(item.metadata.distance, 4),
            }, indent=2, ensure_ascii=False))
    except Exception as e:
        print(f"向量搜索出错: {e}")

    # --- Keyword (BM25) search. ---
    print("\n--- 关键词搜索 ---")
    try:
        keyword_result = articles_collection.query.bm25(
            query="向量数据库",
            limit=2,
            # Request the score explicitly; otherwise metadata.score is None
            # and the "得分" field below prints null.
            return_metadata=["score"],
        )

        print(f"关键词搜索: '向量数据库'")
        if keyword_result.objects:
            for item in keyword_result.objects:
                print(json.dumps({
                    "title": item.properties["title"],
                    "content": item.properties["content"][:100] + "...",
                    "得分": getattr(item.metadata, 'score', 'N/A'),
                }, indent=2, ensure_ascii=False))
        else:
            print("未找到匹配的结果")
    except Exception as e:
        print(f"关键词搜索出错: {e}")

    # --- "Hybrid" section: this runs only a near_vector query (the vector
    # half of a hybrid search); it does not call `query.hybrid`. ---
    print("\n--- 混合搜索(使用 Weaviate 的 hybrid 方法)---")
    try:
        query_vec = generate_mock_vector("向量数据库", dimension=384)

        vector_result = articles_collection.query.near_vector(
            near_vector=query_vec,
            limit=2,
            return_metadata=["distance"],
        )

        print(f"向量搜索(相当于混合搜索的向量部分): '向量数据库'")
        for item in vector_result.objects:
            print(json.dumps({
                "title": item.properties["title"],
                "content": item.properties["content"][:100] + "...",
                "距离": round(item.metadata.distance, 4),
            }, indent=2, ensure_ascii=False))

    except Exception as e:
        print(f"搜索出错: {e}")

    # --- Manual cosine similarity over every stored object. ---
    print("\n--- 手动计算相似度(模拟混合搜索)---")
    try:
        query_vec = generate_mock_vector("向量数据库", dimension=384)

        results = []
        # include_vector=True is required: without it the iterator returns
        # objects with no vector and every object is skipped below.
        for obj in articles_collection.iterator(include_vector=True):
            if not (hasattr(obj, 'vector') and obj.vector):
                continue

            obj_vector = obj.vector
            if isinstance(obj_vector, dict):
                # Vectors may come back keyed by name; take the first one.
                obj_vector = next(iter(obj_vector.values()))

            similarity = np.dot(obj_vector, query_vec) / (
                np.linalg.norm(obj_vector) * np.linalg.norm(query_vec)
            )
            results.append({
                "title": obj.properties["title"],
                "content": obj.properties["content"],
                "similarity": round(float(similarity), 4),
            })

        results.sort(key=lambda x: x["similarity"], reverse=True)

        print(f"手动相似度计算: '向量数据库'")
        for r in results[:2]:
            print(json.dumps(r, indent=2, ensure_ascii=False))

    except Exception as e:
        print(f"手动计算出错: {e}")
        import traceback

        traceback.print_exc()
|