""" sf-ml-baseline v0.1 — inference helpers. Load pre-trained LightGBM + XGBoost + CatBoost ensembles and predict 24h prediction-market outcomes from 5 engineered indicator features. Model weights are in `./weights/`. Licensed CC-BY-4.0 with attribution — see LICENSE. Example: from sf_ml_baseline import SFBaseline model = SFBaseline() p_up = model.predict_direction(price_cents=55, delta_cents=3, iy=12.5, cri=0.6, cvr=0.8) """ from __future__ import annotations from pathlib import Path from typing import Iterable, Sequence import numpy as np import pandas as pd import lightgbm as lgb import xgboost as xgb from catboost import CatBoostClassifier FEATURE_COLS_V1 = ('price_cents', 'delta_cents', 'iy', 'cri', 'cvr') SEEDS = (42, 137, 2026) class SFBaseline: """9-model ensemble (3 architectures × 3 seeds) for 24h direction forecasting.""" def __init__(self, weights_dir: str | Path | None = None): if weights_dir is None: weights_dir = Path(__file__).parent / 'weights' self.weights_dir = Path(weights_dir) if not self.weights_dir.is_dir(): raise FileNotFoundError(f'weights dir not found: {self.weights_dir}') self._load_t1_models() self._load_t4_models() def _load_t1_models(self): """V1 × T1 — direction forecast.""" self.lgb_t1 = [lgb.Booster(model_file=str(self.weights_dir / f'lgbm_v1_t1_seed{s}.txt')) for s in SEEDS] self.xgb_t1 = [] for s in SEEDS: m = xgb.XGBClassifier() m.load_model(str(self.weights_dir / f'xgb_v1_t1_seed{s}.json')) self.xgb_t1.append(m) self.cat_t1 = [] for s in SEEDS: m = CatBoostClassifier() m.load_model(str(self.weights_dir / f'cat_v1_t1_seed{s}.cbm')) self.cat_t1.append(m) def _load_t4_models(self): """V2 × T4 — resolution forecast. Requires 35 features (price + rolling stats) — see docs/ml/phase-a-results.md. This helper loads them but the user must supply the full 35-feature vector (via the training-data pipeline).""" try: self.lgb_t4 = [lgb.Booster(model_file=str(self.weights_dir / f'lgbm_v2_t4_seed{s}.txt')) for s in SEEDS] self.xgb_t4 = [] for s in SEEDS: m = xgb.XGBClassifier() m.load_model(str(self.weights_dir / f'xgb_v2_t4_seed{s}.json')) self.xgb_t4.append(m) self.cat_t4 = [] for s in SEEDS: m = CatBoostClassifier() m.load_model(str(self.weights_dir / f'cat_v2_t4_seed{s}.cbm')) self.cat_t4.append(m) self.t4_features = list(self.lgb_t4[0].feature_name()) except Exception: self.lgb_t4 = self.xgb_t4 = self.cat_t4 = None self.t4_features = None # --- direction forecast (T1) --- def predict_direction_batch(self, df: pd.DataFrame) -> np.ndarray: """Return P(up-move in 24h) for each row in df. df must have columns: price_cents, delta_cents, iy, cri, cvr.""" missing = [c for c in FEATURE_COLS_V1 if c not in df.columns] if missing: raise ValueError(f'missing feature columns: {missing}') X = df[list(FEATURE_COLS_V1)].astype('float32').values return self._predict_t1(X) def predict_direction( self, price_cents: float, delta_cents: float, iy: float, cri: float, cvr: float, ) -> float: """Single-row prediction. Returns scalar probability.""" X = np.array([[price_cents, delta_cents, iy, cri, cvr]], dtype='float32') return float(self._predict_t1(X)[0]) def _predict_t1(self, X: np.ndarray) -> np.ndarray: """9-model ensemble: 3 LGBM + 3 XGB + 3 Cat, equal weight.""" preds = [] for m in self.lgb_t1: preds.append(m.predict(X)) for m in self.xgb_t1: preds.append(m.predict_proba(X)[:, 1]) for m in self.cat_t1: preds.append(m.predict_proba(X)[:, 1]) return np.mean(preds, axis=0) # --- resolution forecast (T4) --- def predict_resolution_batch(self, df: pd.DataFrame) -> np.ndarray: """Return P(YES resolution in 24h) for each row in df. df must have all 35 V2 features (base 5 + rolling stats). See self.t4_features.""" if self.lgb_t4 is None: raise RuntimeError('T4 weights not loaded') missing = [c for c in self.t4_features if c not in df.columns] if missing: raise ValueError(f'missing V2 feature columns: {missing[:5]}... ({len(missing)} total)') X = df[self.t4_features].astype('float32').values preds = [] for m in self.lgb_t4: preds.append(m.predict(X)) for m in self.xgb_t4: preds.append(m.predict_proba(X)[:, 1]) for m in self.cat_t4: preds.append(m.predict_proba(X)[:, 1]) return np.mean(preds, axis=0) if __name__ == '__main__': # smoke test model = SFBaseline() p = model.predict_direction(price_cents=55, delta_cents=3, iy=12.5, cri=0.6, cvr=0.8) print(f'P(price rises in 24h | market at 55c, +3c delta, iy=12.5%, cri=0.6, cvr=0.8) = {p:.3f}') p_batch = model.predict_direction_batch(pd.DataFrame([ {'price_cents': 55, 'delta_cents': 3, 'iy': 12.5, 'cri': 0.6, 'cvr': 0.8}, {'price_cents': 82, 'delta_cents': -1, 'iy': 4.5, 'cri': 0.3, 'cvr': 0.9}, ])) print(f'Batch predictions: {p_batch}')