El Mehdi BELAHNECH committed on
Commit
1a643e0
·
1 Parent(s): 3304d6e

Initial commit: code + hf-space (plain folder), ignore index/venv

Browse files
.gitignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ *.eggs/
8
+ build/
9
+ dist/
10
+
11
+ # venv & OS
12
+ .venv/
13
+ .env/
14
+ .DS_Store
15
+ .tmp-venv/
16
+
17
+ # Notebooks & cache
18
+ .ipynb_checkpoints/
19
+ .cache/
20
+
21
+ # Temp
22
+ tmp/
23
+ logs/
24
+
25
+ # Data / artefacts locaux
26
+ faiss_open_index/
27
+ *.parquet
28
+ *.arrow
29
+ *.faiss
30
+ *.pkl
31
+
32
+ # HF token local si jamais
33
+ .huggingface/
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — UI Gradio simple (FAISS-only) avec citations cliquables
2
+ import gradio as gr
3
+ from langchain_community.vectorstores import FAISS
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+
6
+ INDEX_DIR = "faiss_open_index"
7
+ EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
8
+
9
def load_vs():
    """Load the persisted FAISS vector store found in ``INDEX_DIR``.

    The index directory must already exist (``./faiss_open_index``). Queries
    are embedded with ``EMBED_MODEL``, with normalization requested at
    encode time.
    """
    encode_options = {"normalize_embeddings": True}
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL, encode_kwargs=encode_options)
    # NOTE(review): the flag below opts in to unsafe deserialization of the
    # saved index; only point INDEX_DIR at an index you built yourself.
    store = FAISS.load_local(
        INDEX_DIR,
        embedder,
        allow_dangerous_deserialization=True,
    )
    return store
16
+
17
+ vs = load_vs()
18
+
19
def search(query: str, k: int, lang_filter: str):
    """Run a similarity search and render the hits as an HTML fragment.

    Args:
        query: Free-text question; ``None`` or a blank string yields a
            placeholder hint instead of a search.
        k: Number of passages to retrieve (coerced with ``int``).
        lang_filter: ``"FR"`` or ``"EN"`` keeps only hits whose ``language``
            metadata matches; any other value keeps every hit.

    Returns:
        An HTML string with one card per hit (title, language, link, snippet).
    """
    # Local import: this file has no top-level ``html`` import.
    from html import escape

    q = (query or "").strip()
    if not q:
        return "<i>Entre une question…</i>"
    docs = vs.similarity_search(q, k=int(k))
    # Optional language filter; fall back to the unfiltered hits when the
    # filter would leave nothing to show.
    if lang_filter in ("FR", "EN"):
        keep = "fr" if lang_filter == "FR" else "en"
        docs = [d for d in docs if d.metadata.get("language", "") == keep] or docs

    cards = []
    for i, d in enumerate(docs, 1):
        # Metadata and page text come from the indexed corpus, not from this
        # app, so everything interpolated into markup is escaped first.
        title = escape(str(d.metadata.get("title", "—")))
        lang = escape(str(d.metadata.get("language", "—")))
        raw_url = str(d.metadata.get("url", "#"))
        # Only http(s) targets may end up in href (blocks javascript: etc.).
        if not raw_url.startswith(("http://", "https://")):
            raw_url = "#"
        url = escape(raw_url, quote=True)
        text = d.page_content
        # Add the ellipsis only when the passage was actually cut.
        snippet = text[:420] + ("…" if len(text) > 420 else "")
        snippet = escape(snippet.replace("\n", " "))
        cards.append(
            f"<div style='margin:10px 0;padding:10px;border:1px solid #eee;border-radius:12px'>"
            f"<div><b>{i}. {title}</b> <span style='opacity:.6'>[{lang}]</span></div>"
            f"<div style='margin:4px 0'><a href='{url}' target='_blank'>{url}</a></div>"
            f"<div style='opacity:.85'>{snippet}</div>"
            f"</div>"
        )
    return "\n".join(cards)
43
+
44
# Page layout: question box, retrieval controls, trigger button, results pane.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🔎 Experiment Brief — Recherche sourcée (FAISS)")

    with gr.Row():
        q = gr.Textbox(
            label="Ta question",
            placeholder="Ex. Différence interleaving vs A/B ?",
        )

    with gr.Row():
        k = gr.Slider(
            1,
            10,
            value=5,
            step=1,
            label="Nombre de passages (k)",
        )
        lang = gr.Radio(
            choices=["Tous", "FR", "EN"],
            value="Tous",
            label="Langue",
        )

    go = gr.Button("Rechercher")
    out = gr.HTML()
    # Clicking the button sends the three inputs to search() and shows its HTML.
    go.click(search, inputs=[q, k, lang], outputs=out)

if __name__ == "__main__":
    demo.launch()
build_open_dataset_curated.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_open_dataset_curated.py — Wikipédia FR/EN must-have (strict mais pragmatique)
2
+ from typing import List, Dict, Optional
3
+ import wikipediaapi
4
+ from datasets import Dataset, DatasetDict, Features, Value, Sequence
5
+
6
+ HF_USER = "lmhdii"
7
+ DS_NAME = f"{HF_USER}/experiment-brief-open"
8
+
9
+ # ----- Candidats par thème (on essaie dans l'ordre) -----
10
+ CANDIDATES_EN = {
11
+ "A/B testing": ["A/B testing", "Split testing"],
12
+ "Interleaving": ["Interleaving (information retrieval)", "Team-draft interleaving"],
13
+ "Sequential analysis": ["Sequential analysis"],
14
+ "False discovery rate": ["False discovery rate", "Benjamini–Hochberg procedure", "Benjamini-Hochberg procedure"],
15
+ "Sample size": ["Sample size determination"],
16
+ "Power": ["Power (statistics)"],
17
+ "Non-inferiority": ["Non-inferiority trial"],
18
+ "Equivalence": ["Equivalence test"],
19
+ "Bandit": ["Multi-armed bandit"],
20
+ "Thompson": ["Thompson sampling"],
21
+ "Randomized": ["Randomized controlled trial", "Randomized experiment"],
22
+ "Control": ["Scientific control", "Controlled experiment", "Control group"],
23
+ "EN Interleaving" : ["Team-draft interleaving", "Interleaving (statistics)"],
24
+ }
25
+
26
+ CANDIDATES_FR = {
27
+ "Test A/B": ["Test A/B"],
28
+ "Analyse séquentielle": ["Analyse séquentielle"],
29
+ "FDR": ["Taux de fausses découvertes", "Taux de fausse découverte"],
30
+ "Benjamini-Hochberg": ["Procédure de Benjamini-Hochberg", "Procédure de Benjamini–Hochberg"],
31
+ "Taille d'échantillon": ["Taille d'échantillon", "Échantillon (statistiques)"],
32
+ "Puissance": ["Puissance statistique", "Puissance (statistique)"],
33
+ "Non-infériorité": ["Essai de non-infériorité"],
34
+ "Équivalence": ["Test d'équivalence (statistiques)", "Test d'équivalence"],
35
+ "Bandit": ["Bandit manchot"],
36
+ "Thompson": ["Échantillonnage de Thompson"],
37
+ "Essai randomisé": ["Essai randomisé contrôlé"],
38
+ "Témoin": ["Groupe témoin"], # + proche de "Scientific control"
39
+ "FR FDR" : ["Taux de fausses découvertes", "Taux de fausse découverte (statistiques)"],
40
+ "FR Non-infériorité" : ["Essai de non-infériorité", "Essai de non-infériorité (statistiques)"],
41
+ "FR Équivalence" : ["Test d'équivalence (statistiques)", "Test d'équivalence"],
42
+ }
43
+
44
+ FEATURES = Features({
45
+ "id": Value("string"),
46
+ "source_type": Value("string"),
47
+ "title": Value("string"),
48
+ "url": Value("string"),
49
+ "language": Value("string"),
50
+ "year": Value("string"),
51
+ "topics": Sequence(Value("string")),
52
+ "text": Value("string"),
53
+ })
54
+
55
+ # --- Garde-fou "pertinence domaine" (un poil plus large) ---
56
+ KEYS_EN = [
57
+ "a/b testing","split testing","online controlled experiment","interleaving",
58
+ "information retrieval","sample ratio mismatch","srm","cuped","guardrail",
59
+ "overall evaluation criterion","oec","sequential","false discovery rate",
60
+ "benjamini","multi-armed bandit","thompson sampling","non-inferiority",
61
+ "equivalence test","power (statistics)","sample size","scientific control",
62
+ "controlled experiment","control group","randomized controlled trial"
63
+ ]
64
+ KEYS_FR = [
65
+ "test a/b","expérience contrôlée","essai randomisé","analyse séquentielle",
66
+ "taux de fausses découvertes","benjamini","taille d'échantillon",
67
+ "puissance (statistique)","puissance statistique","non-infériorité",
68
+ "test d'équivalence","bandit manchot","échantillonnage de thompson",
69
+ "groupe témoin","essai randomisé contrôlé"
70
+ ]
71
+ BLOCK_TITLES = {
72
+ "Audio Video Interleave","Desirable difficulty","Essais (Montaigne)",
73
+ "Équivalent métabolique","Expérience de Stanford"
74
+ }
75
+
76
def relevant(title: str, text: str, lang: str) -> bool:
    """Return True when a page looks on-topic for the experimentation domain.

    A page whose exact title is listed in ``BLOCK_TITLES`` is always rejected.
    Otherwise it is accepted as soon as one keyword for the language
    (``KEYS_EN`` when ``lang == "en"``, ``KEYS_FR`` for anything else) occurs
    as a substring of the lower-cased title or text.
    """
    if title in BLOCK_TITLES:
        return False
    title_lc = (title or "").lower()
    text_lc = (text or "").lower()
    keywords = KEYS_EN if lang == "en" else KEYS_FR
    for keyword in keywords:
        if keyword in title_lc or keyword in text_lc:
            return True
    return False
83
def fetch_best(wiki, lang, candidates):
    """Return a dataset row for the best available candidate page, or None.

    Candidates are tried in order. The first page that exists, has non-blank
    text and passes ``relevant`` wins. If none passes the relevance guard,
    the first page that merely exists with non-blank text is used instead
    ("force include"). Each candidate is requested once only.

    Args:
        wiki: Object exposing ``page(title)`` (a ``wikipediaapi.Wikipedia``).
        lang: Language code stored in the row and passed to ``relevant``.
        candidates: Page titles to try, most preferred first.

    Returns:
        A dict with the keys ``id``, ``source_type``, ``title``, ``url``,
        ``language``, ``year``, ``topics`` and ``text``; or ``None`` when no
        candidate exists with text.
    """
    def _row(page):
        # Single place that defines the row layout for both outcomes.
        return {
            "id": f"wiki::{lang}::{page.title}",
            "source_type": "wiki",
            "title": page.title,
            "url": page.fullurl,
            "language": lang,
            "year": "",
            "topics": [],
            "text": page.text or "",
        }

    fallback = None
    for title in candidates:
        page = wiki.page(title)
        if not (page.exists() and (page.text or "").strip()):
            continue
        # 1) strict attempt: page must also pass the relevance guard
        if relevant(page.title, page.text, lang):
            return _row(page)
        # 2) remember the first usable page in case the guard is too strict
        if fallback is None:
            fallback = page
    return _row(fallback) if fallback is not None else None
113
+
114
def collect(lang: str, topics: Dict[str, List[str]]) -> List[Dict]:
    """Fetch one row per topic for ``lang``, skipping pages already collected.

    Args:
        lang: Wikipedia language code used to build the API client.
        topics: Mapping of topic label to candidate page titles; only the
            candidate lists are used.

    Returns:
        The collected rows, in topic order, with unique titles.
    """
    wiki = wikipediaapi.Wikipedia(language=lang, user_agent="experiment-brief-assistant/0.4")
    out, seen = [], set()
    for cand in topics.values():
        row = fetch_best(wiki, lang, cand)
        if row is None:
            print(f"⚠️ missing: [{lang}] {cand}")
        elif row["title"] in seen:
            # Several topics may resolve to the same page; that is not a
            # missing page, so report it separately.
            print(f"↺ duplicate: [{lang}] {row['title']}")
        else:
            out.append(row)
            seen.add(row["title"])
            print(f"✓ [{lang}] {row['title']}")
    return out
125
+
126
if __name__ == "__main__":
    # Fetch both languages, then publish them as two splits of one dataset.
    print("→ Fetch curated EN (patched)…")
    en_rows = collect("en", CANDIDATES_EN)
    print("→ Fetch curated FR (patched)…")
    fr_rows = collect("fr", CANDIDATES_FR)

    dsd = DatasetDict(
        {
            "wiki_en": Dataset.from_list(en_rows, features=FEATURES),
            "wiki_fr": Dataset.from_list(fr_rows, features=FEATURES),
        }
    )
    split_sizes = {}
    for split_name, split_rows in dsd.items():
        split_sizes[split_name] = len(split_rows)
    print(split_sizes)

    print(f"→ Push to Hub: {DS_NAME}")
    # private=False: the dataset is pushed as a public repository.
    dsd.push_to_hub(DS_NAME, private=False)
    print("✅ Dataset publié (curated patched).")
hf-space/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ faiss_open_index/* filter=lfs diff=lfs merge=lfs -text
hf-space/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Experiment Checklist Assistant
3
+ emoji: 🏢
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.49.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
hf-space/app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — UI Gradio simple (FAISS-only) avec citations cliquables
2
+ import gradio as gr
3
+ from langchain_community.vectorstores import FAISS
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+
6
+ INDEX_DIR = "faiss_open_index"
7
+ EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
8
+
9
def load_vs():
    """Load the persisted FAISS vector store found in ``INDEX_DIR``.

    The index directory must already exist (``./faiss_open_index``). Queries
    are embedded with ``EMBED_MODEL``, with normalization requested at
    encode time.
    """
    encode_options = {"normalize_embeddings": True}
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL, encode_kwargs=encode_options)
    # NOTE(review): the flag below opts in to unsafe deserialization of the
    # saved index; only point INDEX_DIR at an index you built yourself.
    store = FAISS.load_local(
        INDEX_DIR,
        embedder,
        allow_dangerous_deserialization=True,
    )
    return store
16
+
17
+ vs = load_vs()
18
+
19
def search(query: str, k: int, lang_filter: str):
    """Run a similarity search and render the hits as an HTML fragment.

    Args:
        query: Free-text question; ``None`` or a blank string yields a
            placeholder hint instead of a search.
        k: Number of passages to retrieve (coerced with ``int``).
        lang_filter: ``"FR"`` or ``"EN"`` keeps only hits whose ``language``
            metadata matches; any other value keeps every hit.

    Returns:
        An HTML string with one card per hit (title, language, link, snippet).
    """
    # Local import: this file has no top-level ``html`` import.
    from html import escape

    q = (query or "").strip()
    if not q:
        return "<i>Entre une question…</i>"
    docs = vs.similarity_search(q, k=int(k))
    # Optional language filter; fall back to the unfiltered hits when the
    # filter would leave nothing to show.
    if lang_filter in ("FR", "EN"):
        keep = "fr" if lang_filter == "FR" else "en"
        docs = [d for d in docs if d.metadata.get("language", "") == keep] or docs

    cards = []
    for i, d in enumerate(docs, 1):
        # Metadata and page text come from the indexed corpus, not from this
        # app, so everything interpolated into markup is escaped first.
        title = escape(str(d.metadata.get("title", "—")))
        lang = escape(str(d.metadata.get("language", "—")))
        raw_url = str(d.metadata.get("url", "#"))
        # Only http(s) targets may end up in href (blocks javascript: etc.).
        if not raw_url.startswith(("http://", "https://")):
            raw_url = "#"
        url = escape(raw_url, quote=True)
        text = d.page_content
        # Add the ellipsis only when the passage was actually cut.
        snippet = text[:420] + ("…" if len(text) > 420 else "")
        snippet = escape(snippet.replace("\n", " "))
        cards.append(
            f"<div style='margin:10px 0;padding:10px;border:1px solid #eee;border-radius:12px'>"
            f"<div><b>{i}. {title}</b> <span style='opacity:.6'>[{lang}]</span></div>"
            f"<div style='margin:4px 0'><a href='{url}' target='_blank'>{url}</a></div>"
            f"<div style='opacity:.85'>{snippet}</div>"
            f"</div>"
        )
    return "\n".join(cards)
43
+
44
# Page layout: question box, retrieval controls, trigger button, results pane.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🔎 Experiment Brief — Recherche sourcée (FAISS)")

    with gr.Row():
        q = gr.Textbox(
            label="Ta question",
            placeholder="Ex. Différence interleaving vs A/B ?",
        )

    with gr.Row():
        k = gr.Slider(
            1,
            10,
            value=5,
            step=1,
            label="Nombre de passages (k)",
        )
        lang = gr.Radio(
            choices=["Tous", "FR", "EN"],
            value="Tous",
            label="Langue",
        )

    go = gr.Button("Rechercher")
    out = gr.HTML()
    # Clicking the button sends the three inputs to search() and shows its HTML.
    go.click(search, inputs=[q, k, lang], outputs=out)

if __name__ == "__main__":
    demo.launch()
hf-space/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ numpy<2
2
+ faiss-cpu==1.7.4
3
+ gradio>=4.25,<5
4
+ langchain-community>=0.2,<0.4
5
+ sentence-transformers>=2.2,<3
6
+ huggingface-hub>=0.20
hybrid_search.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # hybrid_search.py — Hybrid retrieval (FAISS dense + BM25 lexical) + query rewrite + domain filter + SRM boost
2
+ # Compatible Python 3.9
3
+
4
+ from typing import List, Tuple, Dict
5
+ from collections import defaultdict
6
+ import re
7
+
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain_community.embeddings import HuggingFaceEmbeddings
10
+ from langchain_community.retrievers import BM25Retriever
11
+ from langchain_core.documents import Document
12
+
13
+ # ---------- Config ----------
14
+ INDEX_DIR = "faiss_open_index" # dossier créé par index_open_faiss.py
15
+ EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
16
+
17
+ # Expansion de requêtes (ajoute du contexte domaine)
18
+ EXPAND: Dict[str, List[str]] = {
19
+ r"\binterleaving\b": [
20
+ "team-draft interleaving", "search ranking evaluation", "information retrieval"
21
+ ],
22
+ r"\ba/?b\b": [
23
+ "ab testing", "split testing", "online controlled experiment", "randomized experiment"
24
+ ],
25
+ r"\bsrm\b|sample ratio mismatch": [
26
+ "randomization check", "allocation imbalance", "allocation ratio",
27
+ "chi-squared test", "goodness of fit", "pearson chi-squared", "A/A test"
28
+ ],
29
+ r"\bcuped\b": ["variance reduction", "covariate adjustment"],
30
+ r"\bguardrail(s)?\b": ["guardrail metric", "overall evaluation criterion", "oec"],
31
+ r"\bsequential\b": ["sequential analysis", "alpha spending", "group sequential"],
32
+ r"\bfdr\b|\bfalse discovery rate\b": ["benjamini", "benjamini–hochberg"],
33
+ r"\bbandit\b": ["multi-armed bandit", "thompson sampling"],
34
+ r"\bnon[- ]?inferiority\b": ["equivalence test"],
35
+ }
36
+
37
+ # Termes de domaine pour filtrer les résultats trop génériques
38
+ DOMAIN_TERMS = [
39
+ "a/b testing","ab testing","split testing","online controlled experiment",
40
+ "interleaving","team-draft interleaving","information retrieval",
41
+ "sample ratio mismatch","srm","randomization check","allocation ratio",
42
+ "chi-squared","pearson","goodness of fit","a/a test",
43
+ "cuped","guardrail","overall evaluation criterion","oec",
44
+ "sequential analysis","alpha spending","false discovery rate","benjamini",
45
+ "multi-armed bandit","thompson sampling","non-inferiority","equivalence test",
46
+ "power analysis","sample size determination","control group","scientific control",
47
+ ]
48
+
49
+ # Légers boosts pour les requêtes SRM
50
+ BOOST_TERMS = [
51
+ "sample ratio mismatch","randomization check","allocation ratio",
52
+ "chi-squared","goodness of fit","a/a test"
53
+ ]
54
+
55
+ # ---------- Utilitaires ----------
56
def rewrite(q: str) -> str:
    """Append domain terms to the query for every ``EXPAND`` pattern it matches.

    Patterns are searched in the lower-cased query; the original query text is
    returned unchanged when nothing matches.
    """
    lowered = q.lower()
    additions = [
        term
        for pattern, terms in EXPAND.items()
        if re.search(pattern, lowered)
        for term in terms
    ]
    if additions:
        return f"{q} " + " ".join(additions)
    return q
64
+
65
def rrf(dense_hits: "List[Tuple[Document, float]]",
        sparse_hits: "List[Document]",
        k: int = 60,
        topk: int = 5) -> "List[Document]":
    """Reciprocal Rank Fusion (RRF): merge the dense and lexical rankings.

    Each hit contributes ``1 / (k + rank)`` (rank starting at 1) to the score
    of its chunk. Chunks are identified by their content (``metadata["id"]``
    plus ``page_content``) rather than by object identity, because the two
    retrievers are not guaranteed to return the same Python objects for the
    same chunk; keying on ``id(doc)`` would never fuse them and could list a
    chunk twice.

    Args:
        dense_hits: ``(document, score)`` pairs in dense-rank order; the score
            itself is ignored, only the position counts.
        sparse_hits: Documents in lexical-rank order.
        k: RRF smoothing constant.
        topk: Maximum number of documents returned.

    Returns:
        Up to ``topk`` distinct documents, best fused score first. Ties keep
        first-seen order (dense hits before sparse hits).
    """
    def _key(doc):
        return (doc.metadata.get("id"), doc.page_content)

    score = defaultdict(float)
    uniq = {}

    for rank, (doc, _) in enumerate(dense_hits, start=1):
        key = _key(doc)
        score[key] += 1.0 / (k + rank)
        uniq.setdefault(key, doc)

    for rank, doc in enumerate(sparse_hits, start=1):
        key = _key(doc)
        score[key] += 1.0 / (k + rank)
        uniq.setdefault(key, doc)

    # dict preserves insertion order and sorted() is stable, so ties stay in
    # first-seen order.
    ranked = sorted(uniq, key=lambda key: score[key], reverse=True)
    return [uniq[key] for key in ranked[:topk]]
86
+
87
def is_domain(doc: Document) -> bool:
    """Simple filter: True when the title or content mentions a domain term.

    The title (from metadata, empty when absent) and the page content are
    lower-cased and scanned for any entry of ``DOMAIN_TERMS``.
    """
    title = doc.metadata.get("title", "")
    haystack = (title + " " + doc.page_content).lower()
    for term in DOMAIN_TERMS:
        if term in haystack:
            return True
    return False
91
+
92
def boost_rank(docs: List[Document]) -> List[Document]:
    """Reorder documents by how often they mention the ``BOOST_TERMS``.

    Documents with more occurrences (counted in lower-cased content plus
    title) come first; the sort is stable, so equal counts keep their
    incoming order. A new list is returned.
    """
    def occurrences(doc: Document) -> int:
        text = (doc.page_content + " " + doc.metadata.get("title", "")).lower()
        total = 0
        for term in BOOST_TERMS:
            total += text.count(term)
        return total

    ordered = list(docs)
    ordered.sort(key=occurrences, reverse=True)
    return ordered
98
+
99
+ # ---------- Chargement index + BM25 ----------
100
def load_retrievers():
    """Load the FAISS store and build a BM25 retriever over the same chunks.

    Returns:
        A ``(vector_store, bm25_retriever)`` pair; the BM25 retriever is set
        to return its top 12 lexical hits.
    """
    embedder = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        encode_kwargs={"normalize_embeddings": True},
    )
    # The index saved with save_local carries its chunk documents along.
    store = FAISS.load_local(
        INDEX_DIR,
        embedder,
        allow_dangerous_deserialization=True,
    )

    # Build BM25 over the very same chunks, read back from the docstore.
    chunks = list(store.docstore._dict.values())
    lexical = BM25Retriever.from_documents(chunks)
    lexical.k = 12  # top lexical hits

    return store, lexical
114
+
115
+ # ---------- API de recherche ----------
116
class HybridSearcher:
    """Hybrid retrieval: dense FAISS hits fused with lexical BM25 hits."""

    def __init__(self):
        """Load both retrievers once, at construction time."""
        retrievers = load_retrievers()
        self.vs = retrievers[0]
        self.bm25 = retrievers[1]

    def search(self, q: str, k_dense: int = 12, k_final: int = 5):
        """Search with the rewritten query and return the final documents.

        Args:
            q: User query; it is expanded by ``rewrite`` before retrieval.
            k_dense: Number of dense hits requested from the vector store.
            k_final: Number of documents returned.

        Returns:
            ``(results, rewritten_query)`` where ``results`` holds at most
            ``k_final`` documents.
        """
        expanded = rewrite(q)

        # Dense hits come back with their scores.
        dense_hits = self.vs.similarity_search_with_score(expanded, k=k_dense)
        # Lexical hits: .invoke() avoids the deprecation warning.
        lexical_hits = self.bm25.invoke(expanded)

        # Fuse generously (at least 12) so filtering still leaves candidates.
        pool_size = max(k_final * 3, 12)
        fused = rrf(dense_hits, lexical_hits, topk=pool_size)

        in_domain = [doc for doc in fused if is_domain(doc)]
        # Fall back to the unfiltered fusion when the domain filter is empty.
        pool = in_domain if in_domain else fused
        return boost_rank(pool)[:k_final], expanded
134
+
135
+ # ---------- CLI ----------
136
if __name__ == "__main__":
    # Interactive loop: an empty line or Ctrl-C ends the session quietly.
    try:
        hs = HybridSearcher()
        print("Hybrid search prêt ✅ (FAISS + BM25).")
        while True:
            q = input("\nTa question (ENTER pour quitter): ").strip()
            if q == "":
                break
            hits, q2 = hs.search(q)
            print(f"\nQuery réécrite: {q2}")
            if hits:
                for i, d in enumerate(hits, 1):
                    meta = d.metadata
                    title = meta.get("title")
                    url = meta.get("url")
                    lang = meta.get("language")
                    # Keep the preview on a single line.
                    snippet = d.page_content[:160].replace("\n", " ")
                    print(f"{i}. {title} [{lang}] — {url}\n {snippet} …")
            else:
                print("Aucun résultat.")
    except KeyboardInterrupt:
        pass
index_open_faiss.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # index_open_faiss.py — construit un index FAISS à partir du dataset open
2
+ from datasets import load_dataset, DatasetDict
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_core.documents import Document
6
+ from tqdm import tqdm
7
+ import re
8
+
9
+ DATASET_ID = "lmhdii/experiment-brief-open" # ← laisse ton ID
10
+ INDEX_DIR = "faiss_open_index"
11
+
12
def chunk(text, size=900, overlap=150):
    """Split text into overlapping character windows.

    Whitespace runs are collapsed to single spaces and the text is stripped
    first. Windows are ``size`` characters long and start every
    ``max(size - overlap, 1)`` characters. Splitting stops as soon as a
    window reaches the end of the text, so no trailing window is emitted
    that is entirely contained in the previous one.

    Args:
        text: Input string; ``None`` is treated as empty.
        size: Window length in characters.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The list of windows, in order; empty for blank input.
    """
    text = re.sub(r"\s+", " ", text or "").strip()
    step = max(size - overlap, 1)
    out = []
    for start in range(0, len(text), step):
        out.append(text[start:start + size])
        # This window already covers the tail: a further one would only
        # repeat part of it.
        if start + size >= len(text):
            break
    return out
19
+
20
print("→ Loading dataset…")
dsd = DatasetDict()
for split in ["wiki_en", "wiki_fr"]:
    # Best effort: a split that cannot be loaded is reported and skipped.
    try:
        dsd[split] = load_dataset(DATASET_ID, split=split)
        print(f" {split}: {len(dsd[split])} rows")
    except Exception as e:
        print(f" skip {split} ({e})")

docs = []
for split, ds in dsd.items():
    for r in tqdm(ds, desc=f"chunk {split}"):
        meta = {
            "id": r["id"],
            "title": r["title"],
            "url": r["url"],
            "language": r["language"],
            "source_type": r["source_type"],
            "split": split,
        }
        for c in chunk(r["text"]):
            # Each chunk gets its own metadata dict so that editing one
            # document's metadata cannot leak into its siblings.
            docs.append(Document(page_content=c, metadata=dict(meta)))

print(f"→ Total chunks: {len(docs)}")
if not docs:
    # Nothing was loaded (e.g. every split was skipped): stop here with a
    # clear message instead of failing inside the index build.
    raise SystemExit("No chunks to index: no dataset split could be loaded.")

# Multilingual FR/EN
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    encode_kwargs={"normalize_embeddings": True}
)

print("→ Building FAISS…")
vs = FAISS.from_documents(docs, emb)
vs.save_local(INDEX_DIR)
print(f"✅ Saved index to ./{INDEX_DIR}")

# Smoke test
q = "Qu'est-ce qu'un SRM en A/B testing et comment le diagnostiquer ?"
retriever = vs.as_retriever(search_kwargs={"k": 5})
hits = retriever.invoke(q)

print("\nTop-5 résultats :")
for i, d in enumerate(hits, 1):
    print(f"{i}. {d.metadata.get('title')} [{d.metadata.get('language')}] — {d.metadata.get('url')}")
    print(" ", d.page_content[:140].replace("\n", " "), "…")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ numpy<2
2
+ faiss-cpu==1.7.4
3
+ gradio>=4.25,<5
4
+ langchain-community>=0.2,<0.4
5
+ sentence-transformers>=2.2,<3
6
+ huggingface-hub>=0.20
test_retrieval.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import FAISS
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+
4
INDEX_DIR = "faiss_open_index"

emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    encode_kwargs={"normalize_embeddings": True}
)

# allow_dangerous_deserialization is required to reload the saved FAISS index
vs = FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)
retriever = vs.as_retriever(search_kwargs={"k": 5})

# Interactive loop: an empty line or Ctrl-C ends the session.
while True:
    try:
        q = input("\nTa question (ENTER pour quitter): ").strip()
        if not q:
            break
        hits = retriever.invoke(q)
        for i, d in enumerate(hits, 1):
            print(f"{i}. {d.metadata.get('title')} — {d.metadata.get('url')}")
            # Replace real newlines ('\n'); the previous '\\n' matched a
            # literal backslash followed by "n" and left line breaks in place.
            print(" ", d.page_content[:140].replace('\n', ' '), "…")
    except KeyboardInterrupt:
        break