tayyab-077 committed
Commit 8c9ab96 · 1 Parent(s): c99229a

Files changed (2):
  1. app.py +2 -4
  2. src/model_loader.py +16 -11
app.py CHANGED
@@ -1,10 +1,8 @@
-# app.py — CPU-ready with Gemma 2B
 import gradio as gr
 import os
 import tempfile
 import textwrap
 from datetime import datetime
-from pathlib import Path
 from typing import List, Dict, Any, Optional
 
 from src.model_loader import load_local_model
@@ -14,7 +12,7 @@ from src.chatbot import LocalChatbot
 # ----------------------
 # Model setup
 # ----------------------
-MODEL_PATH = "models/gemma-2-2b-it-Q4_K_M.gguf"  # quantized 2B
+MODEL_PATH = "models/gemma-2-2b-it-Q4_K_M"  # quantized 2B
 llm = load_local_model(MODEL_PATH, device=-1)  # CPU
 memory = ConversationMemory(max_len=60)
 bot = LocalChatbot(llm, memory)
@@ -103,7 +101,7 @@ def generate_reply(user_msg: str, history: Optional[List[Dict[str, Any]]]):
     return history
 
 # ----------------------
-# UI
+# Gradio UI
 # ----------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
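
Note (illustrative sketch, not part of the commit): load_local_model now takes a transformers-style device argument, where -1 selects CPU and 0 or higher selects a CUDA device. One way the call site in app.py could pick the device dynamically, assuming torch is installed:

    import torch

    from src.model_loader import load_local_model

    MODEL_PATH = "models/gemma-2-2b-it-Q4_K_M"

    # transformers pipeline convention: -1 = CPU, 0+ = CUDA device index
    device = 0 if torch.cuda.is_available() else -1
    llm = load_local_model(MODEL_PATH, device=device)
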
src/model_loader.py CHANGED
@@ -1,13 +1,18 @@
-# src/model_loader.py
-from llama_cpp import Llama
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
-def load_local_model(model_path):
-    print(f"Loading model: {model_path}")
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=4096,
-        n_threads=6,
-        n_gpu_layers=0,
-        verbose=False
+def load_local_model(model_path, device=0):
+    """
+    Loads a local quantized model for CPU or GPU.
+    device=-1 => CPU, device>=0 => GPU
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+
+    # Use pipeline for text generation
+    generator = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        device=device
     )
-    return llm
+    return generator
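
Note (illustrative sketch, not part of the commit): the loader now returns a transformers text-generation pipeline rather than a llama_cpp.Llama instance, so callers invoke it directly with a prompt. The prompt text and generation parameters below are placeholders:

    generator = load_local_model("models/gemma-2-2b-it-Q4_K_M", device=-1)  # CPU

    # a text-generation pipeline returns a list of dicts with "generated_text"
    outputs = generator(
        "Explain what a context window is.",
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
    )
    print(outputs[0]["generated_text"])
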