El Mehdi BELAHNECH committed on
Commit ·
1a643e0
1
Parent(s): 3304d6e
Initial commit: code + hf-space (plain folder), ignore index/venv
Browse files- .gitignore +33 -0
- app.py +56 -0
- build_open_dataset_curated.py +139 -0
- hf-space/.gitattributes +36 -0
- hf-space/README.md +13 -0
- hf-space/app.py +56 -0
- hf-space/requirements.txt +6 -0
- hybrid_search.py +156 -0
- index_open_faiss.py +64 -0
- requirements.txt +6 -0
- test_retrieval.py +25 -0
.gitignore
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*.egg-info/
|
| 7 |
+
*.eggs/
|
| 8 |
+
build/
|
| 9 |
+
dist/
|
| 10 |
+
|
| 11 |
+
# venv & OS
|
| 12 |
+
.venv/
|
| 13 |
+
.env/
|
| 14 |
+
.DS_Store
|
| 15 |
+
.tmp-venv/
|
| 16 |
+
|
| 17 |
+
# Notebooks & cache
|
| 18 |
+
.ipynb_checkpoints/
|
| 19 |
+
.cache/
|
| 20 |
+
|
| 21 |
+
# Temp
|
| 22 |
+
tmp/
|
| 23 |
+
logs/
|
| 24 |
+
|
| 25 |
+
# Data / artefacts locaux
|
| 26 |
+
faiss_open_index/
|
| 27 |
+
*.parquet
|
| 28 |
+
*.arrow
|
| 29 |
+
*.faiss
|
| 30 |
+
*.pkl
|
| 31 |
+
|
| 32 |
+
# HF token local si jamais
|
| 33 |
+
.huggingface/
|
app.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py — UI Gradio simple (FAISS-only) avec citations cliquables
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from langchain_community.vectorstores import FAISS
|
| 4 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 5 |
+
|
| 6 |
+
INDEX_DIR = "faiss_open_index"
|
| 7 |
+
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
| 8 |
+
|
| 9 |
+
def load_vs():
    """Load the persisted FAISS vector store from ``INDEX_DIR``.

    Builds a ``HuggingFaceEmbeddings`` instance for ``EMBED_MODEL`` (with
    normalized embeddings) and passes it to ``FAISS.load_local``.

    Returns:
        The vector store object returned by ``FAISS.load_local``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        encode_kwargs={"normalize_embeddings": True},
    )
    # The index must already be present in ./faiss_open_index.
    # NOTE(review): allow_dangerous_deserialization=True presumably opts in to
    # pickle-based loading; only load index folders you trust. TODO confirm.
    store = FAISS.load_local(
        INDEX_DIR,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store
|
| 16 |
+
|
| 17 |
+
vs = load_vs()
|
| 18 |
+
|
| 19 |
+
def search(query: str, k: int, lang_filter: str):
    """Return an HTML fragment listing the top-``k`` passages for ``query``.

    Args:
        query: Free-text question typed by the user; blank input yields a hint.
        k: Number of passages requested from the vector store (cast to ``int``).
        lang_filter: ``"FR"`` or ``"EN"`` keeps only passages whose
            ``language`` metadata matches; any other value disables the filter.

    Returns:
        A string of HTML: one card per passage, joined by newlines.
    """
    # Local import: the module has no top-level ``html`` import.
    from html import escape

    q = (query or "").strip()
    if not q:
        return "<i>Entre une question…</i>"
    docs = vs.similarity_search(q, k=int(k))
    # Optional language filter; if nothing matches, fall back to the
    # unfiltered hits rather than showing an empty result.
    if lang_filter in ("FR", "EN"):
        keep = "fr" if lang_filter == "FR" else "en"
        docs = [d for d in docs if d.metadata.get("language", "") == keep] or docs

    cards = []
    for i, d in enumerate(docs, 1):
        # Metadata and passage text come from the index, not from us: escape
        # everything so quotes, '&' or '<' cannot break the markup or the
        # single-quoted href attribute.
        title = escape(str(d.metadata.get("title") or "—"))
        url = escape(str(d.metadata.get("url") or "#"), quote=True)
        lang = escape(str(d.metadata.get("language") or "—"))
        snippet = escape((d.page_content[:420] + "…").replace("\n", " "))
        cards.append(
            f"<div style='margin:10px 0;padding:10px;border:1px solid #eee;border-radius:12px'>"
            f"<div><b>{i}. {title}</b> <span style='opacity:.6'>[{lang}]</span></div>"
            f"<div style='margin:4px 0'><a href='{url}' target='_blank'>{url}</a></div>"
            f"<div style='opacity:.85'>{snippet}</div>"
            f"</div>"
        )
    return "\n".join(cards)
|
| 43 |
+
|
| 44 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 45 |
+
gr.Markdown("## 🔎 Experiment Brief — Recherche sourcée (FAISS)")
|
| 46 |
+
with gr.Row():
|
| 47 |
+
q = gr.Textbox(label="Ta question", placeholder="Ex. Différence interleaving vs A/B ?")
|
| 48 |
+
with gr.Row():
|
| 49 |
+
k = gr.Slider(1, 10, value=5, step=1, label="Nombre de passages (k)")
|
| 50 |
+
lang = gr.Radio(choices=["Tous", "FR", "EN"], value="Tous", label="Langue")
|
| 51 |
+
go = gr.Button("Rechercher")
|
| 52 |
+
out = gr.HTML()
|
| 53 |
+
go.click(search, inputs=[q, k, lang], outputs=out)
|
| 54 |
+
|
| 55 |
+
if __name__ == "__main__":
|
| 56 |
+
demo.launch()
|
build_open_dataset_curated.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# build_open_dataset_curated.py — Wikipédia FR/EN must-have (strict mais pragmatique)
|
| 2 |
+
from typing import List, Dict, Optional
|
| 3 |
+
import wikipediaapi
|
| 4 |
+
from datasets import Dataset, DatasetDict, Features, Value, Sequence
|
| 5 |
+
|
| 6 |
+
HF_USER = "lmhdii"
|
| 7 |
+
DS_NAME = f"{HF_USER}/experiment-brief-open"
|
| 8 |
+
|
| 9 |
+
# ----- Candidats par thème (on essaie dans l'ordre) -----
|
| 10 |
+
CANDIDATES_EN = {
|
| 11 |
+
"A/B testing": ["A/B testing", "Split testing"],
|
| 12 |
+
"Interleaving": ["Interleaving (information retrieval)", "Team-draft interleaving"],
|
| 13 |
+
"Sequential analysis": ["Sequential analysis"],
|
| 14 |
+
"False discovery rate": ["False discovery rate", "Benjamini–Hochberg procedure", "Benjamini-Hochberg procedure"],
|
| 15 |
+
"Sample size": ["Sample size determination"],
|
| 16 |
+
"Power": ["Power (statistics)"],
|
| 17 |
+
"Non-inferiority": ["Non-inferiority trial"],
|
| 18 |
+
"Equivalence": ["Equivalence test"],
|
| 19 |
+
"Bandit": ["Multi-armed bandit"],
|
| 20 |
+
"Thompson": ["Thompson sampling"],
|
| 21 |
+
"Randomized": ["Randomized controlled trial", "Randomized experiment"],
|
| 22 |
+
"Control": ["Scientific control", "Controlled experiment", "Control group"],
|
| 23 |
+
"EN Interleaving" : ["Team-draft interleaving", "Interleaving (statistics)"],
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
CANDIDATES_FR = {
|
| 27 |
+
"Test A/B": ["Test A/B"],
|
| 28 |
+
"Analyse séquentielle": ["Analyse séquentielle"],
|
| 29 |
+
"FDR": ["Taux de fausses découvertes", "Taux de fausse découverte"],
|
| 30 |
+
"Benjamini-Hochberg": ["Procédure de Benjamini-Hochberg", "Procédure de Benjamini–Hochberg"],
|
| 31 |
+
"Taille d'échantillon": ["Taille d'échantillon", "Échantillon (statistiques)"],
|
| 32 |
+
"Puissance": ["Puissance statistique", "Puissance (statistique)"],
|
| 33 |
+
"Non-infériorité": ["Essai de non-infériorité"],
|
| 34 |
+
"Équivalence": ["Test d'équivalence (statistiques)", "Test d'équivalence"],
|
| 35 |
+
"Bandit": ["Bandit manchot"],
|
| 36 |
+
"Thompson": ["Échantillonnage de Thompson"],
|
| 37 |
+
"Essai randomisé": ["Essai randomisé contrôlé"],
|
| 38 |
+
"Témoin": ["Groupe témoin"], # + proche de "Scientific control"
|
| 39 |
+
"FR FDR" : ["Taux de fausses découvertes", "Taux de fausse découverte (statistiques)"],
|
| 40 |
+
"FR Non-infériorité" : ["Essai de non-infériorité", "Essai de non-infériorité (statistiques)"],
|
| 41 |
+
"FR Équivalence" : ["Test d'équivalence (statistiques)", "Test d'équivalence"],
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
FEATURES = Features({
|
| 45 |
+
"id": Value("string"),
|
| 46 |
+
"source_type": Value("string"),
|
| 47 |
+
"title": Value("string"),
|
| 48 |
+
"url": Value("string"),
|
| 49 |
+
"language": Value("string"),
|
| 50 |
+
"year": Value("string"),
|
| 51 |
+
"topics": Sequence(Value("string")),
|
| 52 |
+
"text": Value("string"),
|
| 53 |
+
})
|
| 54 |
+
|
| 55 |
+
# --- Garde-fou "pertinence domaine" (un poil plus large) ---
|
| 56 |
+
KEYS_EN = [
|
| 57 |
+
"a/b testing","split testing","online controlled experiment","interleaving",
|
| 58 |
+
"information retrieval","sample ratio mismatch","srm","cuped","guardrail",
|
| 59 |
+
"overall evaluation criterion","oec","sequential","false discovery rate",
|
| 60 |
+
"benjamini","multi-armed bandit","thompson sampling","non-inferiority",
|
| 61 |
+
"equivalence test","power (statistics)","sample size","scientific control",
|
| 62 |
+
"controlled experiment","control group","randomized controlled trial"
|
| 63 |
+
]
|
| 64 |
+
KEYS_FR = [
|
| 65 |
+
"test a/b","expérience contrôlée","essai randomisé","analyse séquentielle",
|
| 66 |
+
"taux de fausses découvertes","benjamini","taille d'échantillon",
|
| 67 |
+
"puissance (statistique)","puissance statistique","non-infériorité",
|
| 68 |
+
"test d'équivalence","bandit manchot","échantillonnage de thompson",
|
| 69 |
+
"groupe témoin","essai randomisé contrôlé"
|
| 70 |
+
]
|
| 71 |
+
BLOCK_TITLES = {
|
| 72 |
+
"Audio Video Interleave","Desirable difficulty","Essais (Montaigne)",
|
| 73 |
+
"Équivalent métabolique","Expérience de Stanford"
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
def relevant(title: str, text: str, lang: str) -> bool:
    """Tell whether a page looks on-topic for the experimentation domain.

    A title listed in ``BLOCK_TITLES`` is always rejected. Otherwise the page
    is accepted as soon as one keyword (``KEYS_EN`` when ``lang == "en"``,
    ``KEYS_FR`` for any other value) occurs, case-insensitively, as a
    substring of the title or of the text.
    """
    if title in BLOCK_TITLES:
        return False

    title_lc = (title or "").lower()
    text_lc = (text or "").lower()
    keywords = KEYS_EN if lang == "en" else KEYS_FR

    for keyword in keywords:
        if keyword in title_lc or keyword in text_lc:
            return True
    return False
|
| 83 |
+
def _wiki_row(page, lang, text):
    """Build the dataset row (matching ``FEATURES``) for one Wikipedia page."""
    return {
        "id": f"wiki::{lang}::{page.title}",
        "source_type": "wiki",
        "title": page.title,
        "url": page.fullurl,
        "language": lang,
        "year": "",
        "topics": [],
        "text": text,
    }


def fetch_best(wiki, lang, candidates):
    """Return the dataset row for the best candidate title, or ``None``.

    Candidates are tried in order. The first page that exists, has non-blank
    text and passes ``relevant`` wins. If no page passes the relevance guard,
    the first existing non-blank page is used as a "force include" fallback.

    Args:
        wiki: Object exposing ``page(title)``; the returned page must provide
            ``exists()``, ``text``, ``title`` and ``fullurl``.
        lang: Language code stored in the row and passed to ``relevant``.
        candidates: Iterable of page titles, most preferred first.

    Returns:
        A row dict, or ``None`` when no candidate page exists with text.
    """
    fallback = None
    # Single pass: each candidate is looked up once, and the first usable page
    # is remembered in case the relevance guard rejects every candidate.
    for title in candidates:
        page = wiki.page(title)
        if not page.exists():
            continue
        text = page.text or ""
        if not text.strip():
            continue
        if relevant(page.title, text, lang):
            return _wiki_row(page, lang, text)
        if fallback is None:
            fallback = (page, text)

    if fallback is None:
        return None
    page, text = fallback
    return _wiki_row(page, lang, text)
|
| 113 |
+
|
| 114 |
+
def collect(lang: str, topics: Dict[str, List[str]]) -> List[Dict]:
    """Fetch one row per topic for ``lang``, skipping repeated pages.

    Args:
        lang: Wikipedia language code passed to the API client.
        topics: Mapping of topic label to candidate page titles, tried in
            order by ``fetch_best``. Only the candidate lists are used.

    Returns:
        The fetched rows, in topic order, with at most one row per page title.
    """
    wiki = wikipediaapi.Wikipedia(language=lang, user_agent="experiment-brief-assistant/0.4")
    out, seen = [], set()
    for cand in topics.values():
        row = fetch_best(wiki, lang, cand)
        if not row:
            print(f"⚠️ missing: [{lang}] {cand}")
        elif row["title"] in seen:
            # Several topics may resolve to the same page; that is a
            # duplicate, not a missing page, so report it as such.
            print(f"↺ duplicate: [{lang}] {row['title']}")
        else:
            out.append(row)
            seen.add(row["title"])
            print(f"✓ [{lang}] {row['title']}")
    return out
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
print("→ Fetch curated EN (patched)…")
|
| 128 |
+
en_rows = collect("en", CANDIDATES_EN)
|
| 129 |
+
print("→ Fetch curated FR (patched)…")
|
| 130 |
+
fr_rows = collect("fr", CANDIDATES_FR)
|
| 131 |
+
|
| 132 |
+
wiki_en = Dataset.from_list(en_rows, features=FEATURES)
|
| 133 |
+
wiki_fr = Dataset.from_list(fr_rows, features=FEATURES)
|
| 134 |
+
dsd = DatasetDict({"wiki_en": wiki_en, "wiki_fr": wiki_fr})
|
| 135 |
+
print({k: len(v) for k, v in dsd.items()})
|
| 136 |
+
|
| 137 |
+
print(f"→ Push to Hub: {DS_NAME}")
|
| 138 |
+
dsd.push_to_hub(DS_NAME, private=False)
|
| 139 |
+
print("✅ Dataset publié (curated patched).")
|
hf-space/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
faiss_open_index/* filter=lfs diff=lfs merge=lfs -text
|
hf-space/README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Experiment Checklist Assistant
|
| 3 |
+
emoji: 🏢
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.49.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
hf-space/app.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py — UI Gradio simple (FAISS-only) avec citations cliquables
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from langchain_community.vectorstores import FAISS
|
| 4 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 5 |
+
|
| 6 |
+
INDEX_DIR = "faiss_open_index"
|
| 7 |
+
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
| 8 |
+
|
| 9 |
+
def load_vs():
    """Load the persisted FAISS vector store from ``INDEX_DIR``.

    Builds a ``HuggingFaceEmbeddings`` instance for ``EMBED_MODEL`` (with
    normalized embeddings) and passes it to ``FAISS.load_local``.

    Returns:
        The vector store object returned by ``FAISS.load_local``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        encode_kwargs={"normalize_embeddings": True},
    )
    # The index must already be present in ./faiss_open_index.
    # NOTE(review): allow_dangerous_deserialization=True presumably opts in to
    # pickle-based loading; only load index folders you trust. TODO confirm.
    store = FAISS.load_local(
        INDEX_DIR,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store
|
| 16 |
+
|
| 17 |
+
vs = load_vs()
|
| 18 |
+
|
| 19 |
+
def search(query: str, k: int, lang_filter: str):
    """Return an HTML fragment listing the top-``k`` passages for ``query``.

    Args:
        query: Free-text question typed by the user; blank input yields a hint.
        k: Number of passages requested from the vector store (cast to ``int``).
        lang_filter: ``"FR"`` or ``"EN"`` keeps only passages whose
            ``language`` metadata matches; any other value disables the filter.

    Returns:
        A string of HTML: one card per passage, joined by newlines.
    """
    # Local import: the module has no top-level ``html`` import.
    from html import escape

    q = (query or "").strip()
    if not q:
        return "<i>Entre une question…</i>"
    docs = vs.similarity_search(q, k=int(k))
    # Optional language filter; if nothing matches, fall back to the
    # unfiltered hits rather than showing an empty result.
    if lang_filter in ("FR", "EN"):
        keep = "fr" if lang_filter == "FR" else "en"
        docs = [d for d in docs if d.metadata.get("language", "") == keep] or docs

    cards = []
    for i, d in enumerate(docs, 1):
        # Metadata and passage text come from the index, not from us: escape
        # everything so quotes, '&' or '<' cannot break the markup or the
        # single-quoted href attribute.
        title = escape(str(d.metadata.get("title") or "—"))
        url = escape(str(d.metadata.get("url") or "#"), quote=True)
        lang = escape(str(d.metadata.get("language") or "—"))
        snippet = escape((d.page_content[:420] + "…").replace("\n", " "))
        cards.append(
            f"<div style='margin:10px 0;padding:10px;border:1px solid #eee;border-radius:12px'>"
            f"<div><b>{i}. {title}</b> <span style='opacity:.6'>[{lang}]</span></div>"
            f"<div style='margin:4px 0'><a href='{url}' target='_blank'>{url}</a></div>"
            f"<div style='opacity:.85'>{snippet}</div>"
            f"</div>"
        )
    return "\n".join(cards)
|
| 43 |
+
|
| 44 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 45 |
+
gr.Markdown("## 🔎 Experiment Brief — Recherche sourcée (FAISS)")
|
| 46 |
+
with gr.Row():
|
| 47 |
+
q = gr.Textbox(label="Ta question", placeholder="Ex. Différence interleaving vs A/B ?")
|
| 48 |
+
with gr.Row():
|
| 49 |
+
k = gr.Slider(1, 10, value=5, step=1, label="Nombre de passages (k)")
|
| 50 |
+
lang = gr.Radio(choices=["Tous", "FR", "EN"], value="Tous", label="Langue")
|
| 51 |
+
go = gr.Button("Rechercher")
|
| 52 |
+
out = gr.HTML()
|
| 53 |
+
go.click(search, inputs=[q, k, lang], outputs=out)
|
| 54 |
+
|
| 55 |
+
if __name__ == "__main__":
|
| 56 |
+
demo.launch()
|
hf-space/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy<2
|
| 2 |
+
faiss-cpu==1.7.4
|
| 3 |
+
gradio>=4.25,<5
|
| 4 |
+
langchain-community>=0.2,<0.4
|
| 5 |
+
sentence-transformers>=2.2,<3
|
| 6 |
+
huggingface-hub>=0.20
|
hybrid_search.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# hybrid_search.py — Hybrid retrieval (FAISS dense + BM25 lexical) + query rewrite + domain filter + SRM boost
|
| 2 |
+
# Compatible Python 3.9
|
| 3 |
+
|
| 4 |
+
from typing import List, Tuple, Dict
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
from langchain_community.vectorstores import FAISS
|
| 9 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 10 |
+
from langchain_community.retrievers import BM25Retriever
|
| 11 |
+
from langchain_core.documents import Document
|
| 12 |
+
|
| 13 |
+
# ---------- Config ----------
|
| 14 |
+
INDEX_DIR = "faiss_open_index" # dossier créé par index_open_faiss.py
|
| 15 |
+
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
| 16 |
+
|
| 17 |
+
# Expansion de requêtes (ajoute du contexte domaine)
|
| 18 |
+
EXPAND: Dict[str, List[str]] = {
|
| 19 |
+
r"\binterleaving\b": [
|
| 20 |
+
"team-draft interleaving", "search ranking evaluation", "information retrieval"
|
| 21 |
+
],
|
| 22 |
+
r"\ba/?b\b": [
|
| 23 |
+
"ab testing", "split testing", "online controlled experiment", "randomized experiment"
|
| 24 |
+
],
|
| 25 |
+
r"\bsrm\b|sample ratio mismatch": [
|
| 26 |
+
"randomization check", "allocation imbalance", "allocation ratio",
|
| 27 |
+
"chi-squared test", "goodness of fit", "pearson chi-squared", "A/A test"
|
| 28 |
+
],
|
| 29 |
+
r"\bcuped\b": ["variance reduction", "covariate adjustment"],
|
| 30 |
+
r"\bguardrail(s)?\b": ["guardrail metric", "overall evaluation criterion", "oec"],
|
| 31 |
+
r"\bsequential\b": ["sequential analysis", "alpha spending", "group sequential"],
|
| 32 |
+
r"\bfdr\b|\bfalse discovery rate\b": ["benjamini", "benjamini–hochberg"],
|
| 33 |
+
r"\bbandit\b": ["multi-armed bandit", "thompson sampling"],
|
| 34 |
+
r"\bnon[- ]?inferiority\b": ["equivalence test"],
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
# Termes de domaine pour filtrer les résultats trop génériques
|
| 38 |
+
DOMAIN_TERMS = [
|
| 39 |
+
"a/b testing","ab testing","split testing","online controlled experiment",
|
| 40 |
+
"interleaving","team-draft interleaving","information retrieval",
|
| 41 |
+
"sample ratio mismatch","srm","randomization check","allocation ratio",
|
| 42 |
+
"chi-squared","pearson","goodness of fit","a/a test",
|
| 43 |
+
"cuped","guardrail","overall evaluation criterion","oec",
|
| 44 |
+
"sequential analysis","alpha spending","false discovery rate","benjamini",
|
| 45 |
+
"multi-armed bandit","thompson sampling","non-inferiority","equivalence test",
|
| 46 |
+
"power analysis","sample size determination","control group","scientific control",
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# Légers boosts pour les requêtes SRM
|
| 50 |
+
BOOST_TERMS = [
|
| 51 |
+
"sample ratio mismatch","randomization check","allocation ratio",
|
| 52 |
+
"chi-squared","goodness of fit","a/a test"
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
+
# ---------- Utilitaires ----------
|
| 56 |
+
def rewrite(q: str) -> str:
    """Expand the query with domain terms.

    Every regex key of ``EXPAND`` is searched in the lower-cased query; the
    terms of each matching entry are appended, space-separated, after the
    original (unmodified) query. With no match the query is returned as is.
    """
    lowered = q.lower()
    additions: List[str] = []
    for pattern, terms in EXPAND.items():
        if re.search(pattern, lowered) is not None:
            additions.extend(terms)

    if not additions:
        return q
    return f"{q} " + " ".join(additions)
|
| 64 |
+
|
| 65 |
+
def _doc_key(doc: Document) -> Tuple[str, str]:
    """Return a content-based identity for a chunk: (source id, chunk text)."""
    meta = getattr(doc, "metadata", None) or {}
    return (str(meta.get("id", "")), doc.page_content)


def rrf(dense_hits: List[Tuple[Document, float]],
        sparse_hits: List[Document],
        k: int = 60,
        topk: int = 5) -> List[Document]:
    """Reciprocal Rank Fusion (RRF) of the dense and lexical rankings.

    Each hit contributes ``1 / (k + rank)`` (rank starting at 1) to its
    chunk's score; a chunk present in both lists accumulates both
    contributions.

    Chunks are identified by source id and text rather than by ``id(doc)``,
    so fusion does not depend on both retrievers returning the very same
    Python objects. With object identity, the same chunk returned as two
    distinct ``Document`` instances would never be merged and could appear
    twice in the output.

    Args:
        dense_hits: ``(document, score)`` pairs, best first; only the order
            is used, not the score.
        sparse_hits: Documents from the lexical retriever, best first.
        k: RRF damping constant.
        topk: Maximum number of documents returned.

    Returns:
        Up to ``topk`` distinct documents, highest fused score first. For a
        chunk seen in both lists, the first instance seen (dense before
        sparse) is the one returned.
    """
    score = defaultdict(float)
    uniq: Dict[Tuple[str, str], Document] = {}

    for rank, (doc, _) in enumerate(dense_hits, start=1):
        key = _doc_key(doc)
        score[key] += 1.0 / (k + rank)
        uniq.setdefault(key, doc)

    for rank, doc in enumerate(sparse_hits, start=1):
        key = _doc_key(doc)
        score[key] += 1.0 / (k + rank)
        uniq.setdefault(key, doc)

    # sorted() is stable: ties keep first-seen order (dense, then sparse).
    ranked_keys = sorted(uniq, key=lambda key: score[key], reverse=True)
    return [uniq[key] for key in ranked_keys[:topk]]
|
| 86 |
+
|
| 87 |
+
def is_domain(doc: Document) -> bool:
    """Simple filter: keep documents that mention one of our domain terms.

    The title metadata and the page content are joined, lower-cased and
    scanned for any entry of ``DOMAIN_TERMS`` (substring match).
    """
    haystack = (doc.metadata.get("title", "") + " " + doc.page_content).lower()
    for term in DOMAIN_TERMS:
        if term in haystack:
            return True
    return False
|
| 91 |
+
|
| 92 |
+
def boost_rank(docs: List[Document]) -> List[Document]:
    """Very simple SRM boost: reorder documents by boost-term occurrences.

    Each document is scored by the total number of occurrences of the
    ``BOOST_TERMS`` entries in its lower-cased content and title. Documents
    are returned highest score first; ties keep their incoming order.
    """
    def occurrences(doc: Document) -> int:
        text = (doc.page_content + " " + doc.metadata.get("title", "")).lower()
        total = 0
        for term in BOOST_TERMS:
            total += text.count(term)
        return total

    ranked = list(docs)
    ranked.sort(key=occurrences, reverse=True)
    return ranked
|
| 98 |
+
|
| 99 |
+
# ---------- Chargement index + BM25 ----------
|
| 100 |
+
def load_retrievers():
    """Load the FAISS store and build a BM25 retriever over the same chunks.

    Returns:
        A ``(vector_store, bm25_retriever)`` tuple; the BM25 retriever is
        configured to return 12 documents per query.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        encode_kwargs={"normalize_embeddings": True},
    )
    # The chunks are stored alongside the index when it is saved via save_local.
    store = FAISS.load_local(
        INDEX_DIR,
        embeddings,
        allow_dangerous_deserialization=True,
    )

    # Build BM25 on the same documents (chunks), read from the store's
    # private in-memory docstore mapping.
    chunks = list(store.docstore._dict.values())
    lexical = BM25Retriever.from_documents(chunks)
    lexical.k = 12  # lexical top-k

    return store, lexical
|
| 114 |
+
|
| 115 |
+
# ---------- API de recherche ----------
|
| 116 |
+
class HybridSearcher:
    """Hybrid search front end combining the FAISS store and BM25 retriever."""

    def __init__(self):
        """Load both retrievers via ``load_retrievers``."""
        store, lexical = load_retrievers()
        self.vs = store
        self.bm25 = lexical

    def search(self, q: str, k_dense: int = 12, k_final: int = 5):
        """Run the hybrid pipeline for ``q``.

        Steps: rewrite the query, fetch dense and lexical hits, fuse them with
        ``rrf``, keep in-domain documents (or all fused ones if none qualify),
        then apply ``boost_rank`` and truncate.

        Returns:
            ``(results, rewritten_query)`` where ``results`` holds at most
            ``k_final`` documents.
        """
        expanded = rewrite(q)

        # Dense hits with their scores.
        dense_hits = self.vs.similarity_search_with_score(expanded, k=k_dense)

        # Lexical hits; .invoke() avoids the deprecation warning.
        lexical_hits = self.bm25.invoke(expanded)

        pool_size = max(k_final * 3, 12)
        fused = rrf(dense_hits, lexical_hits, topk=pool_size)

        in_domain = [doc for doc in fused if is_domain(doc)]
        pool = in_domain if in_domain else fused

        top = boost_rank(pool)[:k_final]
        return top, expanded
|
| 134 |
+
|
| 135 |
+
# ---------- CLI ----------
|
| 136 |
+
if __name__ == "__main__":
|
| 137 |
+
try:
|
| 138 |
+
hs = HybridSearcher()
|
| 139 |
+
print("Hybrid search prêt ✅ (FAISS + BM25).")
|
| 140 |
+
while True:
|
| 141 |
+
q = input("\nTa question (ENTER pour quitter): ").strip()
|
| 142 |
+
if not q:
|
| 143 |
+
break
|
| 144 |
+
hits, q2 = hs.search(q)
|
| 145 |
+
print(f"\nQuery réécrite: {q2}")
|
| 146 |
+
if not hits:
|
| 147 |
+
print("Aucun résultat.")
|
| 148 |
+
continue
|
| 149 |
+
for i, d in enumerate(hits, 1):
|
| 150 |
+
title = d.metadata.get("title")
|
| 151 |
+
url = d.metadata.get("url")
|
| 152 |
+
lang = d.metadata.get("language")
|
| 153 |
+
snippet = d.page_content[:160].replace("\n", " ")
|
| 154 |
+
print(f"{i}. {title} [{lang}] — {url}\n {snippet} …")
|
| 155 |
+
except KeyboardInterrupt:
|
| 156 |
+
pass
|
index_open_faiss.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# index_open_faiss.py — construit un index FAISS à partir du dataset open
|
| 2 |
+
from datasets import load_dataset, DatasetDict
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_community.vectorstores import FAISS
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
DATASET_ID = "lmhdii/experiment-brief-open" # ← laisse ton ID
|
| 10 |
+
INDEX_DIR = "faiss_open_index"
|
| 11 |
+
|
| 12 |
+
def chunk(text, size=900, overlap=150):
    """Split ``text`` into overlapping character windows.

    Whitespace runs are collapsed to single spaces and the text is stripped
    before splitting. Consecutive windows start ``max(size - overlap, 1)``
    characters apart.

    Args:
        text: Input string; ``None`` or blank text yields ``[]``.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunks, in order.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")

    text = re.sub(r"\s+", " ", text or "").strip()
    step = max(size - overlap, 1)
    out = []
    for start in range(0, len(text), step):
        out.append(text[start:start + size])
        # Stop once a window reaches the end of the text; a further window
        # would be a tail already fully contained in this one.
        if start + size >= len(text):
            break
    return out
|
| 19 |
+
|
| 20 |
+
print("→ Loading dataset…")
|
| 21 |
+
dsd = DatasetDict()
|
| 22 |
+
for split in ["wiki_en", "wiki_fr"]:
|
| 23 |
+
try:
|
| 24 |
+
dsd[split] = load_dataset(DATASET_ID, split=split)
|
| 25 |
+
print(f" {split}: {len(dsd[split])} rows")
|
| 26 |
+
except Exception as e:
|
| 27 |
+
print(f" skip {split} ({e})")
|
| 28 |
+
|
| 29 |
+
docs = []
|
| 30 |
+
for split, ds in dsd.items():
|
| 31 |
+
for r in tqdm(ds, desc=f"chunk {split}"):
|
| 32 |
+
meta = {
|
| 33 |
+
"id": r["id"],
|
| 34 |
+
"title": r["title"],
|
| 35 |
+
"url": r["url"],
|
| 36 |
+
"language": r["language"],
|
| 37 |
+
"source_type": r["source_type"],
|
| 38 |
+
"split": split,
|
| 39 |
+
}
|
| 40 |
+
for c in chunk(r["text"]):
|
| 41 |
+
docs.append(Document(page_content=c, metadata=meta))
|
| 42 |
+
|
| 43 |
+
print(f"→ Total chunks: {len(docs)}")
|
| 44 |
+
|
| 45 |
+
# Multilingue FR/EN
|
| 46 |
+
emb = HuggingFaceEmbeddings(
|
| 47 |
+
model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
| 48 |
+
encode_kwargs={"normalize_embeddings": True}
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
print("→ Building FAISS…")
|
| 52 |
+
vs = FAISS.from_documents(docs, emb)
|
| 53 |
+
vs.save_local(INDEX_DIR)
|
| 54 |
+
print(f"✅ Saved index to ./{INDEX_DIR}")
|
| 55 |
+
|
| 56 |
+
# Smoke test
|
| 57 |
+
q = "Qu'est-ce qu'un SRM en A/B testing et comment le diagnostiquer ?"
|
| 58 |
+
retriever = vs.as_retriever(search_kwargs={"k": 5})
|
| 59 |
+
hits = retriever.invoke(q)
|
| 60 |
+
|
| 61 |
+
print("\nTop-5 résultats :")
|
| 62 |
+
for i, d in enumerate(hits, 1):
|
| 63 |
+
print(f"{i}. {d.metadata.get('title')} [{d.metadata.get('language')}] — {d.metadata.get('url')}")
|
| 64 |
+
print(" ", d.page_content[:140].replace("\n", " "), "…")
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy<2
|
| 2 |
+
faiss-cpu==1.7.4
|
| 3 |
+
gradio>=4.25,<5
|
| 4 |
+
langchain-community>=0.2,<0.4
|
| 5 |
+
sentence-transformers>=2.2,<3
|
| 6 |
+
huggingface-hub>=0.20
|
test_retrieval.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.vectorstores import FAISS
|
| 2 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 3 |
+
|
| 4 |
+
INDEX_DIR = "faiss_open_index"
|
| 5 |
+
|
| 6 |
+
emb = HuggingFaceEmbeddings(
|
| 7 |
+
model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
| 8 |
+
encode_kwargs={"normalize_embeddings": True}
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
# allow_dangerous_deserialization est nécessaire pour recharger FAISS sauvegardé
|
| 12 |
+
vs = FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)
|
| 13 |
+
retriever = vs.as_retriever(search_kwargs={"k": 5})
|
| 14 |
+
|
| 15 |
+
# Interactive loop: read a question, print the retrieved passages, and stop
# on an empty line, Ctrl-C, or end of input.
while True:
    try:
        q = input("\nTa question (ENTER pour quitter): ").strip()
        if not q:
            break
        hits = retriever.invoke(q)
        for i, d in enumerate(hits, 1):
            print(f"{i}. {d.metadata.get('title')} — {d.metadata.get('url')}")
            # Replace real newlines; the previous '\\n' literal only matched a
            # backslash followed by the letter "n".
            print(" ", d.page_content[:140].replace("\n", " "), "…")
    except (KeyboardInterrupt, EOFError):
        # EOFError: stdin closed (e.g. piped input exhausted) — exit cleanly.
        break
|