FAISS is genuinely pleasant in the experimentation phase: it is fast, easy to pick up, and runs smoothly in a notebook. Moving it into production, however, raises a number of problems.

First, metadata: a FAISS index only understands vectors, so filtering by date or any other attribute means building a separate lookup system yourself. Second, FAISS is a library rather than a service; exposing it through an API means wrapping it with Flask or FastAPI on your own. Finally, and most painfully, persistence: if the pod dies, the index is gone unless you remembered to save it to disk beforehand.

Qdrant addresses these pain points. It behaves much more like a real database: an API out of the box, data that survives restarts, and native support for metadata filtering. Even better, advanced features such as hybrid search (dense + sparse) and quantization are built in.

## The MS MARCO Passages dataset

Dataset: the MS MARCO official page, https://microsoft.github.io/msmarco/

This walkthrough uses the MS MARCO Passage Ranking dataset, a standard benchmark in information retrieval: roughly 8.8 million short text passages scraped from the web. The choice is pragmatic. The passages are short (about 50 words on average), so there is no need for complex text chunking, and the focus can stay on the migration work itself. The experiments use a 100,000-passage subset, which keeps everything fast.

The embedding model is sentence-transformers/all-MiniLM-L6-v2, which produces 384-dimensional dense vectors.

Model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

## Initial setup with FAISS

### Generating the embeddings

Load the raw data and generate embeddings in batches. The key step is saving the result as a .npy file so it never has to be recomputed:

```python
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import csv

DATA_PATH = "../data"
TSV_FILE = f"{DATA_PATH}/collection.tsv"
SAMPLE_SIZE = 100000
MODEL_ID = "all-MiniLM-L6-v2"

def prepare_data():
    print(f"Loading Model {MODEL_ID}...")
    model = SentenceTransformer(MODEL_ID)

    print(f"Reading first {SAMPLE_SIZE} lines from {TSV_FILE}...")
    ids = []
    passages = []

    # Efficiently read line-by-line without loading the entire 8GB file into RAM
    try:
        with open(TSV_FILE, "r", encoding="utf8") as f:
            reader = csv.reader(f, delimiter="\t")
            for i, row in enumerate(reader):
                if i >= SAMPLE_SIZE:
                    break
                # MS MARCO format is: [pid, text]
                if len(row) >= 2:
                    ids.append(int(row[0]))
                    passages.append(row[1])
    except FileNotFoundError:
        print(f"Error: Could not find {TSV_FILE}")
        return

    print(f"Loaded {len(passages)} passages.")

    # Save text metadata (for the Qdrant payload later)
    print("Saving metadata to CSV...")
    df = pd.DataFrame({"id": ids, "text": passages})
    df.to_csv(f"{DATA_PATH}/passages.csv", index=False)

    # Generate embeddings
    print("Encoding embeddings (this may take a moment)...")
    embeddings = model.encode(passages, show_progress_bar=True)

    # Save binary files (for FAISS and Qdrant)
    print("Saving numpy arrays...")
    np.save(f"{DATA_PATH}/embeddings.npy", embeddings)
    np.save(f"{DATA_PATH}/ids.npy", np.array(ids))

    print(f"Success! Saved {embeddings.shape} embeddings to {DATA_PATH}")

if __name__ == "__main__":
    os.makedirs(DATA_PATH, exist_ok=True)
    prepare_data()
```

### Building the index

IndexFlatL2 performs exact search, which is perfectly adequate at the million-vector scale:

```python
import faiss
import numpy as np
import os

DATA_PATH = "../data"
INDEX_OUTPUT_PATH = "./my_index.faiss"

def build_index():
    print("Loading embeddings...")
    # Load the vectors
    if not os.path.exists(f"{DATA_PATH}/embeddings.npy"):
        print(f"Error: {DATA_PATH}/embeddings.npy not found.")
        return

    embeddings = np.load(f"{DATA_PATH}/embeddings.npy")
    d = embeddings.shape[1]  # Dimension (should be 384 for MiniLM)

    print(f"Building index (dimension={d})...")
    # We use IndexFlatL2 for exact search (simple and accurate for ~1M vectors)
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    print(f"Saving index to {INDEX_OUTPUT_PATH}...")
    faiss.write_index(index, INDEX_OUTPUT_PATH)
    print(f"Success! Index contains {index.ntotal} vectors.")

if __name__ == "__main__":
    os.makedirs(os.path.dirname(INDEX_OUTPUT_PATH), exist_ok=True)
    build_index()
```

### A quick semantic search test

Running a single query is enough to expose the problem. The search returns IDs such as [42, 105]; getting the actual text back requires extra code that looks things up in the CSV. That disconnect is the main reason for the migration:

```python
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

INDEX_PATH = "./my_index.faiss"
DATA_PATH = "../data"
MODEL_NAME = "all-MiniLM-L6-v2"

def search_faiss():
    print("Loading index and metadata...")
    index = faiss.read_index(INDEX_PATH)

    # LIMITATION: We must manually load the CSV to get text back.
    # FAISS only stores vectors, not the text itself.
    df = pd.read_csv(f"{DATA_PATH}/passages.csv")
    model = SentenceTransformer(MODEL_NAME)

    # User query
    query_text = "What is the capital of France?"
    print(f"\nQuery: {query_text}")

    # Encode and search
    query_vector = model.encode([query_text])
    D, I = index.search(query_vector, k=3)  # Search for the top 3 results

    print("\n--- Results ---")
    for rank, idx in enumerate(I[0]):
        # LIMITATION: If we wanted to filter by text_length < 50,
        # we would have to fetch ALL results first, then filter in Python.
        # FAISS cannot filter during search.
        text = df.iloc[idx]["text"]  # Manual lookup
        score = D[0][rank]
        print(f"[{rank + 1}] ID: {idx} | Score: {score:.4f}")
        print(f"    Text: {text[:100]}...")

if __name__ == "__main__":
    search_faiss()
```
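The introduction noted that FAISS is a library, not a service. For a sense of what that means in practice, here is a rough sketch of the HTTP wrapper you would have to write and maintain yourself to serve this index. FastAPI is used purely as an illustration; the endpoint, parameter names, and response format are assumptions rather than anything from the original setup:

```python
# Sketch only: a hand-rolled HTTP wrapper around the FAISS index built above.
# The /search endpoint, parameter names, and response shape are illustrative.
import faiss
import pandas as pd
from fastapi import FastAPI
from sentence_transformers import SentenceTransformer

app = FastAPI()
index = faiss.read_index("./my_index.faiss")      # loaded once at startup
df = pd.read_csv("../data/passages.csv")          # manual ID -> text lookup table
model = SentenceTransformer("all-MiniLM-L6-v2")

@app.get("/search")
def search(q: str, k: int = 3):
    # Encode the query and run a top-k nearest-neighbour search
    distances, indices = index.search(model.encode([q]), k)
    # Map row positions back to text ourselves -- FAISS will not do this for us
    return [
        {"id": int(i), "score": float(d), "text": df.iloc[int(i)]["text"]}
        for d, i in zip(distances[0], indices[0])
    ]

# Run with something like: uvicorn search_api:app --port 8000
# Persistence, filtering, and reloading after a crash are still your problem.
```

Qdrant removes the need for this wrapper entirely, which is where the migration below comes in.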
## Migration steps

### Exporting vectors from FAISS

The embeddings.npy file produced earlier already contains everything needed, so there is nothing to export: the numpy array is loaded directly.

### Starting Qdrant locally

Running Qdrant locally is a one-liner:

```bash
docker run -p 6333:6333 qdrant/qdrant
```

### Creating the collection

Collection concepts are documented at https://qdrant.tech/documentation/concepts/collections/

```python
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, HnswConfigDiff

QDRANT_URL = "http://localhost:6333"
COLLECTION_NAME = "ms_marco_passages"

def create_collection():
    client = QdrantClient(url=QDRANT_URL)

    print(f"Creating collection {COLLECTION_NAME}...")
    client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=384,  # Dimension (MiniLM) - must match the dimension used in FAISS
            distance=Distance.COSINE
        ),
        hnsw_config=HnswConfigDiff(
            m=16,              # Links per node (default is 16)
            ef_construct=100   # Search depth during build (default is 100)
        )
    )
    print(f"Collection {COLLECTION_NAME} created with HNSW config.")

if __name__ == "__main__":
    create_collection()
```

### Bulk-uploading the data

The Qdrant Python client is documented at https://qdrant.tech/documentation/clients/python/

```python
import pandas as pd
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

QDRANT_URL = "http://localhost:6333"
COLLECTION_NAME = "ms_marco_passages"
DATA_PATH = "../data"
BATCH_SIZE = 500

def upload_data():
    client = QdrantClient(url=QDRANT_URL)

    print("Loading local data...")
    embeddings = np.load(f"{DATA_PATH}/embeddings.npy")
    df_meta = pd.read_csv(f"{DATA_PATH}/passages.csv")

    total = len(df_meta)
    print(f"Starting upload of {total} vectors...")

    points_batch = []
    for i, row in df_meta.iterrows():
        # Metadata to attach to each point
        payload = {
            "passage_id": int(row["id"]),
            "text": row["text"],
            "text_length": len(str(row["text"])),
            "dataset_source": "msmarco_passages"
        }

        points_batch.append(PointStruct(
            id=int(row["id"]),
            vector=embeddings[i].tolist(),
            payload=payload
        ))

        # Upload a full batch (or the final partial batch)
        if len(points_batch) >= BATCH_SIZE or i == total - 1:
            client.upsert(
                collection_name=COLLECTION_NAME,
                points=points_batch
            )
            points_batch = []
            if i % 1000 == 0:
                print(f"  Processed {i}/{total}...")

    print("Upload complete.")

if __name__ == "__main__":
    upload_data()
```
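One optional step the original walkthrough does not cover: the validation query below filters on text_length and dataset_source, and Qdrant can index those payload fields to keep filtered searches fast. A minimal sketch using the standard client call, with the field names taken from the upload script above (the schema types are assumptions):

```python
# Sketch: create payload indexes for the fields used in the filters below.
# Field names match the payload written by upload_data(); schema types are assumed.
from qdrant_client import QdrantClient
from qdrant_client.models import PayloadSchemaType

client = QdrantClient(url="http://localhost:6333")

# Integer index -> supports range filters such as text_length < 200
client.create_payload_index(
    collection_name="ms_marco_passages",
    field_name="text_length",
    field_schema=PayloadSchemaType.INTEGER,
)

# Keyword index -> supports exact-match filters on the source tag
client.create_payload_index(
    collection_name="ms_marco_passages",
    field_name="dataset_source",
    field_schema=PayloadSchemaType.KEYWORD,
)
```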
### Validating the migration

```python
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, Range, MatchValue
from sentence_transformers import SentenceTransformer

QDRANT_URL = "http://localhost:6333"
COLLECTION_NAME = "ms_marco_passages"
MODEL_NAME = "all-MiniLM-L6-v2"

def validate_migration():
    client = QdrantClient(url=QDRANT_URL)
    model = SentenceTransformer(MODEL_NAME)

    # Verify the total count
    count_result = client.count(COLLECTION_NAME)
    print(f"Total vectors in Qdrant: {count_result.count}")

    # Query example
    query_text = "What is a GPU?"
    print(f"\n--- Query: {query_text} ---")
    query_vector = model.encode(query_text).tolist()

    # Filter definition
    print("Applying filters (length < 200 AND source == msmarco)...")
    search_filter = Filter(
        must=[
            FieldCondition(
                key="text_length",
                range=Range(lt=200)  # can be changed as per the requirement
            ),
            FieldCondition(
                key="dataset_source",
                match=MatchValue(value="msmarco_passages")
            )
        ]
    )

    results = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vector,
        query_filter=search_filter,
        limit=3
    ).points

    for hit in results:
        print(f"\nID: {hit.id} (Score: {hit.score:.3f})")
        print(f"Text: {hit.payload['text']}")
        print(f"Metadata: {hit.payload}")

if __name__ == "__main__":
    validate_migration()
```

## Performance comparison

The same ten common queries were run against both setups:

- FAISS (local, CPU): about 0.5 ms per query, essentially the speed of the raw math
- Qdrant (Docker): about 3 ms per query, including network overhead

For a web service, 3 ms of latency is entirely acceptable, especially given the features gained in exchange.

```python
import time
import faiss
import numpy as np
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

FAISS_INDEX_PATH = "./faiss_index/my_index.faiss"
QDRANT_URL = "http://localhost:6333"
COLLECTION_NAME = "ms_marco_passages"
MODEL_NAME = "all-MiniLM-L6-v2"

QUERIES = [
    "What is a GPU?",
    "Who is the president of France?",
    "How to bake a cake?",
    "Symptoms of the flu",
    "Python programming language",
    "Best places to visit in Italy",
    "Define quantum mechanics",
    "History of the Roman Empire",
    "What is machine learning?",
    "Healthy breakfast ideas"
]

def run_comparison():
    print("--- Loading resources ---")

    # Load the embedding model
    model = SentenceTransformer(MODEL_NAME)

    # Load FAISS (the old way)
    print("Loading FAISS index...")
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)

    # Connect to Qdrant (the new way)
    print("Connecting to Qdrant...")
    client = QdrantClient(url=QDRANT_URL)

    print(f"\n--- Running race ({len(QUERIES)} queries) ---")
    print(f"{'Query':<30} | {'FAISS (ms)':>10} | {'Qdrant (ms)':>11}")
    print("-" * 60)

    faiss_times = []
    qdrant_times = []

    for query_text in QUERIES:
        # Encode once, reuse for both engines
        query_vector = model.encode(query_text).tolist()

        # --- Measure FAISS ---
        start_f = time.perf_counter()
        # FAISS expects a numpy array of shape (1, d)
        faiss_input = np.array([query_vector], dtype="float32")
        _, _ = faiss_index.search(faiss_input, k=3)
        end_f = time.perf_counter()
        faiss_ms = (end_f - start_f) * 1000
        faiss_times.append(faiss_ms)

        # --- Measure Qdrant ---
        start_q = time.perf_counter()
        _ = client.query_points(
            collection_name=COLLECTION_NAME,
            query=query_vector,
            limit=3
        )
        end_q = time.perf_counter()
        qdrant_ms = (end_q - start_q) * 1000
        qdrant_times.append(qdrant_ms)

        print(f"{query_text[:30]:<30} | {faiss_ms:>10.2f} | {qdrant_ms:>11.2f}")

    print("-" * 60)
    print(f"{'AVERAGE':<30} | {np.mean(faiss_times):>10.2f} | {np.mean(qdrant_times):>11.2f}")

if __name__ == "__main__":
    run_comparison()
```

## Test results

The biggest difference is not speed; it is peace of mind. With FAISS, one indexing run over a large batch took 40 minutes and 12 GB of RAM. Just before it finished, the SSH connection dropped and the process was killed. Because FAISS is only a library living in memory, all of that work was lost. Qdrant behaves like a real database: once data has been pushed it is persisted, and even if the Docker connection drops, the data is still there after a restart.

Anyone who has used FAISS knows the extra CSV file that has to be maintained just to map vector IDs back to text. After moving to Qdrant, all of that lookup logic was deleted: text and vectors live together, and a single call to the query API returns complete results. There are no more files to manage; it is simply a microservice you call.

## Migration takeaways

The migration took about a week of on-and-off work, but it paid off. The most satisfying part was not writing the Qdrant scripts; it was deleting the old code. The PR was almost entirely red deletion lines: the CSV loading utilities, the manual ID mapping, all of it went away. The codebase shrank by roughly 30% and became noticeably more readable.

With FAISS alone, search sometimes felt like a gamble: results that were semantically similar but factually wrong showed up regularly. Moving to Qdrant brought not just a database but real control over the system. Dense vectors combined with keyword filtering (hybrid search) finally made precise queries possible, such as "show GPU-related technical documents, but only from the official manual", which simply could not be expressed before.

The change in confidence is the most obvious one. Previously, loading the full 8.8 million passages felt too risky for memory. With the architecture decoupled, all of the data can be pushed to Qdrant, which handles storage and indexing on disk while the application layer stays lightweight. The result is a system that runs equally well in production and in a notebook.

## Summary

FAISS is great for offline research and quick experiments, but running in production requires the kind of infrastructure Qdrant provides. If you are still relying on an extra CSV file to understand what your vectors mean, it is time to consider migrating.

https://avoid.overfit.cn/post/ce7c45d8373741f6b8af465bb06bc398

Author: Sai Bhargav Rallapalli
