# pip install langchain langchain-community faiss-cpu sentence-transformers numpy
from langchain.schema import Document
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
# --------------------------
# 0) Données & helpers
# --------------------------
def build_text(doc, w_name=2.0, w_desc=1.0, w_kw=3.0):
    """Flatten a raw record into one weighted text string.

    Field weighting is done by repetition: a field with weight w is repeated
    int(w) times (so BM25/embeddings see it proportionally more often).

    Bug fixed: the original accepted ``w_desc`` but never used it — the
    description was always emitted exactly once regardless of the weight.
    With the default ``w_desc=1.0`` the output is unchanged.

    Args:
        doc: dict with optional "name", "description", "keywords" keys
             (keywords may be a list or a comma-separated string).
        w_name, w_desc, w_kw: per-field weights (truncated to int, floored at 0).

    Returns:
        The concatenated, weighted text (leading space per repetition).
    """
    name = (doc.get("name") or "").strip()
    desc = (doc.get("description") or "").strip()
    kw = doc.get("keywords") or ""
    if isinstance(kw, list):
        kw = ", ".join(map(str, kw))

    def _rep(text, weight):
        # Guard against negative weights; int() truncates as before.
        return (" " + text) * max(int(weight), 0)

    return _rep(name, w_name) + _rep(desc, w_desc) + _rep(kw, w_kw)
def minmax_norm(x):
    """Rescale an array of scores to [0, 1].

    Empty input is returned as-is; a constant array maps to all zeros
    (avoids division by ~0).
    """
    arr = np.asarray(x, dtype=float)
    if arr.size == 0:
        return arr
    lo = float(np.min(arr))
    hi = float(np.max(arr))
    span = hi - lo
    # All values (nearly) equal -> no meaningful ordering, return zeros.
    if span < 1e-12:
        return np.zeros_like(arr)
    return (arr - lo) / span
def boost_bge_with_keywords(vec_scores, docs, query, alpha=0.1):
    """Return a copy of ``vec_scores`` with ``alpha`` added for keyword hits.

    A document gets the boost when any of its (case-insensitive, stripped)
    metadata keywords appears as a substring of the query. The input array
    is not mutated.
    """
    query_lc = (query or "").lower()
    boosted = vec_scores.copy()
    for idx, doc in enumerate(docs):
        raw_kws = doc.metadata.get("keywords", [])
        # Keywords may be stored as a comma-separated string or a list.
        terms = raw_kws.split(",") if isinstance(raw_kws, str) else raw_kws
        cleaned = [t.strip().lower() for t in terms if t and t.strip()]
        if any(term in query_lc for term in cleaned):
            boosted[idx] += alpha
    return boosted
# --------------------------
# 1) Préparer les Documents
# --------------------------
raw_docs = [
    {"id": 1, "name": "Application Finance",
     "description": "Outil de gestion de portefeuilles et d'analyse financière.",
     "keywords": ["finance", "portefeuille", "marchés"]},
    {"id": 2, "name": "App RH",
     "description": "Gestion des employés, paie et suivi des carrières.",
     "keywords": "RH, paie, congés"},
    {"id": 3, "name": "CRM Ventes",
     "description": "Suivi des clients et opportunités commerciales, pipeline ventes.",
     "keywords": ["CRM", "opportunités", "ventes"]},
]

# Per-field weights passed straight through to build_text().
weights_fields = dict(w_name=2.0, w_desc=1.0, w_kw=3.0)

# Wrap each raw record into a LangChain Document; "global_idx" is the
# stable 0..N-1 index used to align BM25 and dense scores later.
docs = [
    Document(
        page_content=build_text(rec, **weights_fields),
        metadata={
            "id": rec.get("id", pos),
            "name": rec.get("name", ""),
            "description": rec.get("description", ""),
            "keywords": rec.get("keywords", []),
            "global_idx": pos,
        },
    )
    for pos, rec in enumerate(raw_docs)
]
# Mapping from position to global index (identity here).
id_map = list(range(len(raw_docs)))
N = len(docs)
# --------------------------
# 2) BM25 Retriever (lexical)
# --------------------------
# Lexical retriever built over the same Document list as the vector store.
bm25 = BM25Retriever.from_documents(docs)
# Retrieve all N docs so BM25 and dense scores cover the same index space
# and can be fused cleanly.
bm25.k = N
# --------------------------
# 3) VectorStore FAISS + BGE-M3 (sémantique)
# --------------------------
# Important: normalize embeddings so L2/inner-product behaves like cosine sim.
emb = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    encode_kwargs={"normalize_embeddings": True}
)
vstore = FAISS.from_documents(docs, emb)
# NOTE(review): this instruction prefix is the one documented for
# bge-large-en-v1.5; the BGE-M3 model card states that M3 needs *no* query
# instruction — confirm the prefix actually helps before keeping it.
QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
# --------------------------
# 4) Recherche + fusion min-max
# --------------------------
def search_hybrid_minmax_langchain(query, k=5, w_bm25=0.5, w_vec=0.5, kw_alpha=0.12):
    """Hybrid lexical + semantic search with min-max score fusion.

    Args:
        query: free-text user query.
        k: number of fused results to return.
        w_bm25: weight of the normalized BM25 channel.
        w_vec: weight of the normalized dense (BGE) channel.
        kw_alpha: additive keyword boost applied to the dense scores.

    Returns:
        List of dicts (id, name, score_fused, bm25, bge, snippet),
        sorted by descending fused score.
    """
    # --- BM25 (lexical) -------------------------------------------------
    # BM25Retriever exposes no raw scores, so derive pseudo-scores from the
    # rank: 1/1, 1/2, 1/3, ... (rank 1 is best). bm25.k == N -> all docs.
    bm25_hits = bm25.get_relevant_documents(query)
    bm25_scores = np.zeros(N, dtype=float)
    for rank, d in enumerate(bm25_hits, start=1):
        bm25_scores[d.metadata["global_idx"]] = 1.0 / rank

    # --- Dense (BGE-M3 via FAISS) ---------------------------------------
    # Ask for all N results so both channels cover the same index space.
    bge_hits = vstore.similarity_search_with_score(QUERY_PREFIX + query, k=N)
    vec_scores = np.zeros(N, dtype=float)
    for d, dist in bge_hits:
        # BUG FIX: with the default (Euclidean) strategy, FAISS returns a
        # *distance* here — lower is better.  The original code used it
        # directly as a similarity, which inverted the dense ranking.
        # For L2-normalized embeddings, squared L2 distance = 2 - 2*cos,
        # hence cos = 1 - dist/2.  Even if `dist` is plain (non-squared) L2,
        # 1 - dist/2 is still monotonically decreasing, so the ranking
        # stays correct either way.
        vec_scores[d.metadata["global_idx"]] = 1.0 - float(dist) / 2.0

    # Optional additive keyword boost on the dense channel.
    vec_scores = boost_bge_with_keywords(vec_scores, docs, query, alpha=kw_alpha)

    # Min-max normalize each channel so the two weights are comparable.
    bm25_norm = minmax_norm(bm25_scores)
    vec_norm = minmax_norm(vec_scores)
    fused = w_bm25 * bm25_norm + w_vec * vec_norm

    top = np.argsort(-fused)[:k]
    return [
        {
            "id": docs[i].metadata["id"],
            "name": docs[i].metadata["name"],
            "score_fused": float(fused[i]),
            "bm25": float(bm25_norm[i]),
            "bge": float(vec_norm[i]),
            "snippet": (docs[i].metadata["description"] or "")[:220],
        }
        for i in top
    ]
# --------------------------
# 5) Exemple
# --------------------------
if __name__ == "__main__":
    # Quick demo run: finance-oriented query over the 3-doc corpus.
    demo_query = "gestion financière de portefeuilles"
    hits = search_hybrid_minmax_langchain(
        demo_query, k=3, w_bm25=0.5, w_vec=0.5, kw_alpha=0.12
    )
    for hit in hits:
        line = (
            f"[{hit['id']}] {hit['name']} fused={hit['score_fused']:.3f} "
            f"(bm25={hit['bm25']:.3f}, bge={hit['bge']:.3f})"
        )
        print(line)