First commit
Browse files- Dockerfile +1 -1
- app.py +161 -0
- requirements.txt +75 -3
- static/logo_light.png +0 -0
Dockerfile
CHANGED
|
@@ -17,4 +17,4 @@ EXPOSE 8501
|
|
| 17 |
|
| 18 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
|
| 20 |
-
ENTRYPOINT ["streamlit", "run", "
|
|
|
|
| 17 |
|
| 18 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
|
| 20 |
+
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
app.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
# Compute-Optimal LLM Training Estimator (Chinchilla-style)
# ---------------------------------------------------------
# Usage: `streamlit run app.py`  (matches the Dockerfile ENTRYPOINT)
# This tool helps estimate total FLOPs, steps, wall-clock time, and rough cost
# for LLM pretraining given model parameters, token budget, and hardware.

import math

import streamlit as st


def _fmt_or_inf(value: float, fmt: str = ",.0f") -> str:
    """Format *value* with *fmt*, showing '∞' instead of a misleading 0 when
    the estimate is unbounded (division-by-zero guards below yield inf)."""
    return "∞" if math.isinf(value) else f"{value:{fmt}}"


st.set_page_config(page_title="LLM Compute Estimator", page_icon="🧮", layout="centered")

st.title("🧮 LLM Compute-Optimal Estimator")
st.caption("Estimate total FLOPs, wall-clock time, steps, and cost for pretraining — with a Chinchilla-style token rule.")

# --- Sidebar: assumptions ---
with st.sidebar:
    st.logo('./static/logo_light.png')
    st.header("Assumptions & Notes")
    st.markdown(
        """
        **Formulas**
        - **Total FLOPs** ≈ `c * N_params * N_tokens`, with default **c = 6** (forward+backward+optimizer overhead).
        - **Compute-optimal tokens** (rule-of-thumb): `N_tokens ≈ k * N_params`, default **k = 20**.
        - **Effective compute** = `GPU_count * (peak TFLOPs × 1e12) * efficiency`.

        **Disclaimers**
        - This is a *back-of-the-envelope* estimator. Real training efficiency depends on data pipeline, parallelism strategy, sequence length, kernel fusion, optimizer, etc.
        - Preset TFLOPs are **approximate** and depend on precision (FP8/BF16), sparsity, clocks, and vendor kernels.
        """
    )

# --- 1) Model size & tokens ---
st.subheader("1) Model & Token Budget")
col1, col2, col3 = st.columns([1.2, 1, 1])
with col1:
    model_params_b = st.number_input("Model size (Billions of parameters)", min_value=0.05, value=4.0, step=0.5, format="%.2f")
with col2:
    c_overhead = st.number_input("c (FLOPs constant)", min_value=4.0, value=6.0, step=0.5)
with col3:
    k_tokens_per_param = st.number_input("k (tokens per param for compute-optimal)", min_value=5.0, value=20.0, step=1.0)

use_compute_optimal = st.toggle("Use compute‑optimal tokens (k × params)", value=True)
if use_compute_optimal:
    # Chinchilla-style rule of thumb: tokens scale linearly with parameters.
    tokens_b = model_params_b * k_tokens_per_param
    st.info(f"Compute‑optimal token budget ≈ **{tokens_b:,.2f} B** (k = {k_tokens_per_param:g})")
else:
    tokens_b = st.number_input("Token budget (Billions)", min_value=1.0, value=80.0, step=5.0, format="%.2f")

# --- 2) Hardware (moved before batch to define gpu_count first) ---
st.subheader("2) Hardware")
col6, col7 = st.columns(2)
with col6:
    gpu_preset = st.selectbox(
        "GPU preset (approx peak TFLOPs per GPU)",
        (
            "Custom",
            "A100 80GB BF16 ≈ 312",
            "H100 SXM BF16 ≈ 989",
            "B200 (FP8-ish) ≈ 20000",
        ),
        index=0,
        help="Values are back-of-the-envelope. Choose 'Custom' to enter your own.",
    )

# Approximate peak throughput per preset; keys must match the selectbox labels.
preset_map = {
    "A100 80GB BF16 ≈ 312": 312.0,
    "H100 SXM BF16 ≈ 989": 989.0,
    "B200 (FP8-ish) ≈ 20000": 20000.0,
}

with col7:
    if gpu_preset == "Custom":
        peak_tflops = st.number_input("Peak TFLOPs per GPU (approx)", min_value=10.0, value=20000.0, step=100.0)
    else:
        peak_tflops = preset_map[gpu_preset]
        # Disabled input echoes the preset value so the layout stays consistent.
        st.number_input("Peak TFLOPs per GPU (approx)", value=peak_tflops, disabled=True)

col8, col9, col10 = st.columns(3)
with col8:
    gpu_count = st.number_input("GPU count", min_value=1, value=8, step=1)
with col9:
    efficiency = st.slider("Training efficiency (MFU, %)", min_value=10, max_value=95, value=50, step=1)
with col10:
    price_per_gpu_hour = st.number_input("Price per GPU·hour (USD)", min_value=0.0, value=25.0, step=1.0)

# --- 3) Batch & Sequence Settings (tokens_per_step computed from gpu_count) ---
st.subheader("3) Batch & Sequence Settings")
col4, col5 = st.columns(2)
with col4:
    micro_batch = st.number_input("Micro batch size per GPU", min_value=1, value=4, step=1, help="Number of sequences per GPU per optimizer step.")
with col5:
    seq_len = st.number_input("Sequence length (tokens)", min_value=128, value=2048, step=128)

# Global batch in tokens: per-GPU micro batch × sequence length × data-parallel width.
tokens_per_step = micro_batch * seq_len * gpu_count
st.info(f"Tokens per optimization step ≈ {tokens_per_step:,} (with {gpu_count} GPUs)")

# --- Compute ---
N_params = model_params_b * 1e9
N_tokens = tokens_b * 1e9
c = c_overhead

# Total FLOPs (scalar): c ≈ 6 covers forward + backward + optimizer overhead.
flops_total = c * N_params * N_tokens  # in FLOPs

# Effective machine compute per second (peak × MFU across all GPUs).
effective_flops_per_s = gpu_count * (peak_tflops * 1e12) * (efficiency / 100.0)

# Time estimate; guards yield inf rather than raising on a zero denominator.
seconds = flops_total / effective_flops_per_s if effective_flops_per_s > 0 else float('inf')
hours = seconds / 3600
days = hours / 24

# Steps
steps = N_tokens / tokens_per_step if tokens_per_step > 0 else float('inf')

# Throughput
throughput_tokens_per_s = N_tokens / seconds if seconds > 0 else float('inf')

# Cost
cost = price_per_gpu_hour * gpu_count * hours

# --- Display ---
st.divider()
st.subheader("Results")

colA, colB = st.columns(2)
with colA:
    st.metric("Total FLOPs", f"{flops_total:,.2e} FLOPs")
    st.metric("Effective compute", f"{effective_flops_per_s:,.2e} FLOPs/s")
    st.metric("Steps (est)", _fmt_or_inf(steps))
with colB:
    st.metric("Wall‑clock time", f"{hours:,.1f} h (~{days:,.2f} d)")
    st.metric("Throughput", f"{_fmt_or_inf(throughput_tokens_per_s)} tok/s")
    st.metric("Projected cost", f"${_fmt_or_inf(cost)}")

st.divider()

st.markdown(
    f"""
    **Summary**
    - Params: **{model_params_b:,.2f}B** · Tokens: **{tokens_b:,.2f}B** (compute‑optimal: {use_compute_optimal})
    - Constant **c = {c:g}** → Total ≈ **{flops_total:,.2e} FLOPs**.
    - Hardware: **{gpu_count}× GPU**, peak **{peak_tflops:g} TFLOPs/GPU**, MFU **{efficiency}%** → Effective ≈ **{effective_flops_per_s:,.2e} FLOPs/s**.
    - Time ≈ **{hours:,.1f} hours** (≈ {days:,.2f} days). Steps ≈ **{_fmt_or_inf(steps)}** (@ {tokens_per_step:,} tok/step).
    - Rough cost ≈ **${_fmt_or_inf(cost)}** (@ ${price_per_gpu_hour:g}/GPU·h).
    """
)

with st.expander("What is the Chinchilla rule? Is it 1 epoch?"):
    st.markdown(
        """
        **Chinchilla scaling** is a *compute‑optimal* rule of thumb: for a fixed compute budget, scale
        the **training tokens** roughly in proportion to the **model parameters** (commonly ~20× tokens per parameter).
        It is **not** about training for exactly one epoch. In web‑scale pretraining, datasets are often sampled with
        replacement or mixed; you might see data multiple times or less than once. The rule speaks to the *total number
        of tokens* the model should process for best use of compute, not to dataset passes.
        """
    )

st.success("Ready. Tweak inputs on the left to explore different scenarios.")
|
requirements.txt
CHANGED
|
@@ -1,3 +1,75 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiohappyeyeballs==2.6.1
|
| 2 |
+
aiohttp==3.12.15
|
| 3 |
+
aiosignal==1.4.0
|
| 4 |
+
altair==5.5.0
|
| 5 |
+
annotated-types==0.7.0
|
| 6 |
+
anyio==4.10.0
|
| 7 |
+
attrs==25.3.0
|
| 8 |
+
beautifulsoup4==4.13.5
|
| 9 |
+
blinker==1.9.0
|
| 10 |
+
bs4==0.0.2
|
| 11 |
+
cachetools==6.2.0
|
| 12 |
+
certifi==2025.8.3
|
| 13 |
+
charset-normalizer==3.4.3
|
| 14 |
+
click==8.2.1
|
| 15 |
+
datasets==4.1.1
|
| 16 |
+
deprecation==2.1.0
|
| 17 |
+
dill==0.4.0
|
| 18 |
+
distro==1.9.0
|
| 19 |
+
filelock==3.19.1
|
| 20 |
+
frozenlist==1.7.0
|
| 21 |
+
fsspec==2025.9.0
|
| 22 |
+
gitdb==4.0.12
|
| 23 |
+
GitPython==3.1.45
|
| 24 |
+
h11==0.16.0
|
| 25 |
+
hf-xet==1.1.10
|
| 26 |
+
httpcore==1.0.9
|
| 27 |
+
httpx==0.28.1
|
| 28 |
+
huggingface-hub==0.35.3
|
| 29 |
+
idna==3.10
|
| 30 |
+
Jinja2==3.1.6
|
| 31 |
+
jiter==0.10.0
|
| 32 |
+
jsonschema==4.25.1
|
| 33 |
+
jsonschema-specifications==2025.4.1
|
| 34 |
+
lancedb==0.24.3
|
| 35 |
+
MarkupSafe==3.0.2
|
| 36 |
+
multidict==6.6.4
|
| 37 |
+
multiprocess==0.70.16
|
| 38 |
+
narwhals==2.3.0
|
| 39 |
+
numpy==2.3.2
|
| 40 |
+
openai==1.105.0
|
| 41 |
+
overrides==7.7.0
|
| 42 |
+
packaging==25.0
|
| 43 |
+
pandas==2.3.2
|
| 44 |
+
pillow==11.3.0
|
| 45 |
+
propcache==0.3.2
|
| 46 |
+
protobuf==6.32.0
|
| 47 |
+
pyarrow==21.0.0
|
| 48 |
+
pydantic==2.11.7
|
| 49 |
+
pydantic_core==2.33.2
|
| 50 |
+
pydeck==0.9.1
|
| 51 |
+
pylance==0.35.0
|
| 52 |
+
python-dateutil==2.9.0.post0
|
| 53 |
+
python-dotenv==1.1.1
|
| 54 |
+
pytz==2025.2
|
| 55 |
+
PyYAML==6.0.3
|
| 56 |
+
referencing==0.36.2
|
| 57 |
+
requests==2.32.5
|
| 58 |
+
rpds-py==0.27.1
|
| 59 |
+
setuptools==78.1.1
|
| 60 |
+
six==1.17.0
|
| 61 |
+
smmap==5.0.2
|
| 62 |
+
sniffio==1.3.1
|
| 63 |
+
soupsieve==2.8
|
| 64 |
+
streamlit==1.49.1
|
| 65 |
+
tenacity==9.1.2
|
| 66 |
+
toml==0.10.2
|
| 67 |
+
tornado==6.5.2
|
| 68 |
+
tqdm==4.67.1
|
| 69 |
+
typing-inspection==0.4.1
|
| 70 |
+
typing_extensions==4.15.0
|
| 71 |
+
tzdata==2025.2
|
| 72 |
+
urllib3==2.5.0
|
| 73 |
+
wheel==0.45.1
|
| 74 |
+
xxhash==3.5.0
|
| 75 |
+
yarl==1.20.1
|
static/logo_light.png
ADDED
|