DocUA committed on
Commit
eb133b8
·
1 Parent(s): eed9900

feat: update ggml kernels, webui components, model templates, and build configurations

Browse files
.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python builds
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+
6
+ # Virtual environments
7
+ .venv/
8
+ venv/
9
+
10
+ # Editor settings
11
+ .DS_Store
12
+
13
+ # Environment files
14
+ .env
15
+ .env.local
16
+
17
+ # Model caches
18
+ models/
19
+ *.safetensors
20
+
21
+ # Test docs
22
+ test_docs/
23
+ help_docs/
24
+
README.md CHANGED
@@ -1,16 +1,61 @@
1
- ---
2
- title: LightOnOCR 1B Demo
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- license: apache-2.0
14
- ---
15
-
16
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LightOnOCR-1B Demo
2
+
3
+ High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon.
4
+
5
+ ## 🚀 Performance
6
+ - **GGUF Backend:** ~3-4 seconds per page (M3 Max)!
7
+ - **PyTorch Backend:** ~40 seconds per page
8
+
9
+ ## Features
10
+ - 📄 PDF and image support
11
+ - 🔄 Seamless switching between GGUF and PyTorch backends
12
+ - 🎛️ Configurable resolution (scale) and token generation
13
+ - 🖥️ CLI and Gradio web interface
14
+ - 🍎 Full Metal/MPS support
15
+
16
+ ## Quick Start
17
+
18
+ ### 1. Prerequisites
19
+ - Python 3.10+
20
+ - `cmake` and `git`
21
+
22
+ ```bash
23
+ pip install -r requirements.txt
24
+ pip install accelerate
25
+ ```
26
+
27
+ ### 2. Setup GGUF (Highly Recommended)
28
+ See [GGUF Setup Guide](docs/gguf_setup.md).
29
+
30
+ 1. Build `llama.cpp` locally:
31
+ ```bash
32
+ git clone https://github.com/ggerganov/llama.cpp
33
+ cd llama.cpp && mkdir build && cd build
34
+ cmake .. -DGGML_METAL=ON && cmake --build . --config Release -j 8
35
+ cd ../..
36
+ ```
37
+ 2. Download model:
38
+ ```bash
39
+ python download_gguf_model.py
40
+ ```
41
+
42
+ ### 3. Usage
43
+
44
+ **Command Line:**
45
+ ```bash
46
+ # Fastest
47
+ python ocr_cli.py document.pdf --backend gguf
48
+
49
+ # High Quality
50
+ python ocr_cli.py document.pdf --backend gguf --scale 2.0
51
+ ```
52
+
53
+ **Web Interface:**
54
+ ```bash
55
+ python app.py
56
+ ```
57
+ Open http://127.0.0.1:7860 and select **GGUF** backend.
58
+
59
+ ## Documentation
60
+ - [GGUF Setup Guide](docs/gguf_setup.md)
61
+ - [Performance Optimization](docs/performance_optimization.md)
app.py CHANGED
@@ -1,76 +1,92 @@
1
  #!/usr/bin/env python3
 
 
 
 
2
  import os
3
- import json
4
- import base64
5
- import requests
6
  import gradio as gr
 
7
  from PIL import Image
8
- from io import BytesIO
9
  import pypdfium2 as pdfium
10
 
11
- ENDPOINT = os.environ.get("VLLM_ENDPOINT")
12
- MODEL = os.environ.get("VLLM_MODEL")
 
13
 
14
- if not ENDPOINT or not MODEL:
15
- raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.")
 
16
 
17
 
18
- def image_to_base64(image):
19
- buffered = BytesIO()
20
- if image.mode == 'RGBA':
21
- image = image.convert('RGB')
22
- image.save(buffered, format="PNG")
23
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
 
 
 
24
 
25
 
26
- def render_pdf_page(page, max_resolution=1280, scale=2.77):
27
- width, height = page.get_size()
28
- pixel_width = width * scale
29
- pixel_height = height * scale
30
- resize_factor = min(max_resolution / pixel_width, max_resolution / pixel_height)
31
- target_scale = scale * resize_factor
32
- return page.render(scale=target_scale, rev_byteorder=True).to_pil()
33
 
34
 
35
- def process_pdf(pdf_path, num_pages=1):
 
36
  pdf = pdfium.PdfDocument(pdf_path)
37
  total_pages = len(pdf)
38
- pages_to_process = min(num_pages, total_pages, 5)
39
  images = []
40
 
41
  for i in range(pages_to_process):
42
  page = pdf[i]
43
- img = render_pdf_page(page)
44
  images.append(img)
45
 
46
  pdf.close()
47
  return images, total_pages
48
 
49
 
50
- def process_input(file_input, temperature, num_pages):
 
51
  if file_input is None:
52
- yield "Please upload an image or PDF first.", "", "", None
53
  return
54
-
 
 
 
 
 
 
 
 
55
  images_to_process = []
56
  page_info = ""
57
  display_image = None
58
-
59
- file_path = file_input if isinstance(file_input, str) else file_input.name
60
-
61
- if file_path.lower().endswith('.pdf'):
 
 
 
 
62
  try:
63
- images_to_process, total_pages = process_pdf(file_path, num_pages)
64
  if len(images_to_process) == 0:
65
- yield "Error: Could not extract pages from PDF.", "", "", None
66
  return
67
  display_image = images_to_process[0]
68
- if len(images_to_process) == 1:
69
- page_info = f"Processing page 1 of {total_pages}"
70
- else:
71
- page_info = f"Processing {len(images_to_process)} pages of {total_pages}"
72
  except Exception as e:
73
- yield f"Error processing PDF: {str(e)}", "", "", None
74
  return
75
  else:
76
  try:
@@ -79,84 +95,44 @@ def process_input(file_input, temperature, num_pages):
79
  display_image = img
80
  page_info = "Processing image"
81
  except Exception as e:
82
- yield f"Error opening image: {str(e)}", "", "", None
83
  return
84
-
85
- content = [{"type": "text", "text": ""}]
86
-
87
- for img in images_to_process:
88
- try:
89
- b64_image = image_to_base64(img)
90
- content.append({
91
- "type": "image_url",
92
- "image_url": {"url": f"data:image/png;base64,{b64_image}"}
93
- })
94
- except Exception as e:
95
- yield f"Error encoding image: {str(e)}", "", "", display_image
96
- return
97
-
98
- payload = {
99
- "model": MODEL,
100
- "messages": [
101
- {
102
- "role": "user",
103
- "content": content
104
- }
105
- ],
106
- "temperature": temperature,
107
- "stream": True
108
- }
109
 
 
110
  try:
111
- response = requests.post(
112
- ENDPOINT,
113
- headers={"Content-Type": "application/json"},
114
- data=json.dumps(payload),
115
- stream=True
116
- )
117
- response.raise_for_status()
118
-
119
- accumulated_response = ""
120
- first_chunk = True
121
-
122
- for line in response.iter_lines():
123
- if line:
124
- line = line.decode('utf-8')
125
- if line.startswith('data: '):
126
- line = line[6:]
127
-
128
- if line.strip() == '[DONE]':
129
- break
130
-
131
- try:
132
- chunk = json.loads(line)
133
- if 'choices' in chunk and len(chunk['choices']) > 0:
134
- delta = chunk['choices'][0].get('delta', {})
135
- content_delta = delta.get('content', '')
136
- if content_delta:
137
- accumulated_response += content_delta
138
- if first_chunk:
139
- yield accumulated_response, accumulated_response, page_info, display_image
140
- first_chunk = False
141
- else:
142
- yield accumulated_response, accumulated_response, page_info, gr.update()
143
- except json.JSONDecodeError:
144
- continue
145
-
146
  except Exception as e:
147
- error_msg = f"Error: {str(e)}"
148
- yield error_msg, error_msg, page_info, display_image
149
 
150
 
151
- with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
 
152
  gr.Markdown(
153
  """
154
- # 📖 Image/PDF to Text Extraction
155
- **💡 How to use:**
156
- 1. Upload an image or PDF
157
- 2. For PDFs: choose how many pages to process (1-5, default is 1)
158
- 3. Adjust temperature if needed
159
- 4. Click "Extract Text"
160
  """
161
  )
162
 
@@ -168,62 +144,97 @@ with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
168
  type="filepath"
169
  )
170
  rendered_image = gr.Image(
171
- label="📄 Preview (First Page)",
172
  type="pil",
173
- height=400,
174
  interactive=False
175
  )
176
- num_pages = gr.Slider(
177
- minimum=1,
178
- maximum=5,
179
- value=1,
180
- step=1,
181
- label="PDF: Number of Pages to Process",
182
- info="Only applies to PDF files (max 5 pages)"
183
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  page_info = gr.Textbox(
185
  label="Processing Info",
186
  value="",
187
  interactive=False
188
  )
189
- temperature = gr.Slider(
190
- minimum=0.1,
191
- maximum=1.0,
192
- value=0.2,
193
- step=0.05,
194
- label="Temperature"
195
- )
196
- submit_btn = gr.Button("Extract Text", variant="primary")
197
- clear_btn = gr.Button("Clear", variant="secondary")
198
 
199
  with gr.Column(scale=2):
200
- output_text = gr.Markdown(
201
- label="📄 Extracted Text (Rendered)",
202
- value="<div style='min-height: 600px; padding: 10px; border: 1px solid #e0e0e0; border-radius: 4px; background-color: #f9f9f9;'><em>Extracted text will appear here...</em></div>",
203
- height=600
204
- )
205
-
206
- with gr.Row():
207
- with gr.Column():
208
- raw_output = gr.Textbox(
209
- label="Raw Markdown Output",
210
- placeholder="Raw text will appear here...",
211
- lines=20,
212
- max_lines=30,
213
- show_copy_button=True
214
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
 
216
  submit_btn.click(
217
  fn=process_input,
218
- inputs=[file_input, temperature, num_pages],
219
- outputs=[output_text, raw_output, page_info, rendered_image]
220
  )
221
 
222
  clear_btn.click(
223
- fn=lambda: (None, "", "", "", None, 1),
224
- outputs=[file_input, output_text, raw_output, page_info, rendered_image, num_pages]
225
  )
226
 
227
 
228
  if __name__ == "__main__":
229
- demo.launch()
 
1
  #!/usr/bin/env python3
2
+ """
3
+ Gradio web interface for LightOnOCR-1B with backend support.
4
+ """
5
+
6
  import os
7
+ import sys
 
 
8
  import gradio as gr
9
+ from pathlib import Path
10
  from PIL import Image
 
11
  import pypdfium2 as pdfium
12
 
13
+ # Add project root to path
14
+ sys.path.insert(0, str(Path(__file__).parent))
15
+ from backends import create_backend, get_available_backends
16
 
17
+ # Global backend
18
+ BACKEND = None
19
+ CURRENT_BACKEND_NAME = "pytorch"
20
 
21
 
22
def load_backend(backend_name="pytorch"):
    """Return the process-wide OCR backend, creating it on first use or on a backend switch.

    The loaded backend instance and its name are cached in the module globals
    BACKEND / CURRENT_BACKEND_NAME so repeated requests for the same backend
    reuse the already-loaded model.
    """
    global BACKEND, CURRENT_BACKEND_NAME

    cache_hit = BACKEND is not None and CURRENT_BACKEND_NAME == backend_name
    if not cache_hit:
        print(f"Loading {backend_name} backend...")
        fresh = create_backend(backend_name)
        fresh.load_model()
        BACKEND = fresh
        CURRENT_BACKEND_NAME = backend_name
        print(f"Backend loaded: {BACKEND.get_backend_info()}")
    return BACKEND
33
 
34
 
35
def render_pdf_page(page, scale=2.0):
    """Rasterize a single pypdfium2 page and return it as a PIL Image.

    Args:
        page: A pypdfium2 page object.
        scale: Render scale factor (higher = larger / sharper output).
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
 
 
 
 
38
 
39
 
40
def process_pdf(pdf_path, num_pages=1, scale=2.0):
    """Extract rendered page images from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        num_pages: Requested page count; capped at the document length and at 10.
        scale: Render scale forwarded to render_pdf_page.

    Returns:
        Tuple of (list of PIL images, total number of pages in the document).
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        total_pages = len(pdf)
        pages_to_process = min(num_pages, total_pages, 10)  # Max 10 pages
        images = [render_pdf_page(pdf[i], scale=scale) for i in range(pages_to_process)]
    finally:
        # Bug fix: close the document handle even when a page fails to render;
        # previously an exception mid-loop leaked the pdfium document.
        pdf.close()
    return images, total_pages
54
 
55
 
56
+ def process_input(file_input, backend_name, scale, temperature, max_tokens, num_pages):
57
+ """Process uploaded file with OCR."""
58
  if file_input is None:
59
+ yield "Idle", "Please upload an image or PDF first.", "", "", None
60
  return
61
+
62
+ # Load backend
63
+ try:
64
+ backend = load_backend(backend_name)
65
+ except Exception as e:
66
+ error_msg = f"Error loading backend: {str(e)}"
67
+ yield "Error", error_msg, error_msg, "", None
68
+ return
69
+
70
  images_to_process = []
71
  page_info = ""
72
  display_image = None
73
+
74
+ file_path = Path(file_input) if isinstance(file_input, str) else Path(file_input.name)
75
+ if not file_path.exists():
76
+ yield "Error", f"File not accessible: {file_path}", "", "", None
77
+ return
78
+
79
+ # Load images
80
+ if file_path.suffix.lower() == '.pdf':
81
  try:
82
+ images_to_process, total_pages = process_pdf(str(file_path), num_pages, scale)
83
  if len(images_to_process) == 0:
84
+ yield "Error", "Could not extract pages from PDF.", "", "", None
85
  return
86
  display_image = images_to_process[0]
87
+ page_info = f"Processing {len(images_to_process)} of {total_pages} pages"
 
 
 
88
  except Exception as e:
89
+ yield "Error", f"Error processing PDF: {str(e)}", "", "", None
90
  return
91
  else:
92
  try:
 
95
  display_image = img
96
  page_info = "Processing image"
97
  except Exception as e:
98
+ yield "Error", f"Error opening image: {str(e)}", "", "", None
99
  return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+ # Process with OCR
102
  try:
103
+ yield "Processing...", "Processing images...", "", page_info, display_image
104
+
105
+ all_texts = []
106
+ for i, img in enumerate(images_to_process):
107
+ try:
108
+ print(f"Processing page {i+1}/{len(images_to_process)}...")
109
+ text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
110
+ all_texts.append(text.strip())
111
+
112
+ # Update progress
113
+ full_text = "\n\n---\n\n".join(all_texts)
114
+ yield "Processing...", full_text, full_text, page_info, display_image
115
+ except Exception as e:
116
+ error_msg = f"Error on page {i+1}: {str(e)}"
117
+ print(f"ERROR: {error_msg}")
118
+ all_texts.append(f"[{error_msg}]")
119
+ continue
120
+
121
+ # Final result
122
+ final_text = "\n\n---\n\n".join(all_texts)
123
+ yield "Complete", final_text, final_text, page_info, display_image
124
+
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  except Exception as e:
126
+ error_msg = f"Error during processing: {str(e)}"
127
+ yield "Error", error_msg, "", page_info, display_image
128
 
129
 
130
+ # Create Gradio interface
131
+ with gr.Blocks(title="📖 LightOnOCR-1B Demo", theme=gr.themes.Soft()) as demo:
132
  gr.Markdown(
133
  """
134
+ # 📖 LightOnOCR-1B - OCR Demo
135
+ Upload an image or PDF to extract text with configurable quality/speed settings.
 
 
 
 
136
  """
137
  )
138
 
 
144
  type="filepath"
145
  )
146
  rendered_image = gr.Image(
147
+ label="📄 Preview",
148
  type="pil",
149
+ height=300,
150
  interactive=False
151
  )
152
+
153
+ with gr.Accordion("⚙️ Settings", open=True):
154
+ backend_selector = gr.Radio(
155
+ choices=get_available_backends(),
156
+ value="pytorch",
157
+ label="Backend",
158
+ info="PyTorch: best quality | GGUF: faster (if available)"
159
+ )
160
+
161
+ scale_slider = gr.Slider(
162
+ minimum=1.0,
163
+ maximum=3.0,
164
+ value=1.5,
165
+ step=0.5,
166
+ label="PDF Scale",
167
+ info="Higher = better quality, slower"
168
+ )
169
+
170
+ max_tokens_slider = gr.Slider(
171
+ minimum=256,
172
+ maximum=2048,
173
+ value=1024,
174
+ step=256,
175
+ label="Max Tokens",
176
+ info="Lower = faster, may cut off long text"
177
+ )
178
+
179
+ num_pages = gr.Slider(
180
+ minimum=1,
181
+ maximum=10,
182
+ value=1,
183
+ step=1,
184
+ label="PDF Pages",
185
+ info="Number of pages to process (max 10)"
186
+ )
187
+
188
+ temperature = gr.Slider(
189
+ minimum=0.0,
190
+ maximum=1.0,
191
+ value=0.1,
192
+ step=0.05,
193
+ label="Temperature",
194
+ info="0 = deterministic"
195
+ )
196
+
197
  page_info = gr.Textbox(
198
  label="Processing Info",
199
  value="",
200
  interactive=False
201
  )
202
+
203
+ submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
204
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
 
 
 
 
 
 
205
 
206
  with gr.Column(scale=2):
207
+ status_display = gr.Textbox(
208
+ label="Status",
209
+ value="Idle",
210
+ interactive=False
 
 
 
 
 
 
 
 
 
 
211
  )
212
+
213
+ with gr.Tabs():
214
+ with gr.Tab("📄 Rendered"):
215
+ output_text = gr.Markdown(
216
+ value="*Extracted text will appear here...*",
217
+ height=600
218
+ )
219
+ with gr.Tab("📝 Raw Text"):
220
+ raw_output = gr.Textbox(
221
+ placeholder="Raw text will appear here...",
222
+ lines=25,
223
+ show_copy_button=True
224
+ )
225
 
226
+ # Event handlers
227
  submit_btn.click(
228
  fn=process_input,
229
+ inputs=[file_input, backend_selector, scale_slider, temperature, max_tokens_slider, num_pages],
230
+ outputs=[status_display, output_text, raw_output, page_info, rendered_image]
231
  )
232
 
233
  clear_btn.click(
234
+ fn=lambda: ("Idle", None, "*Extracted text will appear here...*", "", "", None),
235
+ outputs=[status_display, file_input, output_text, raw_output, page_info, rendered_image]
236
  )
237
 
238
 
239
  if __name__ == "__main__":
240
+ demo.launch()
backends/__init__.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Backend interface for LightOnOCR-1B inference.
3
+ Supports both PyTorch and GGUF backends.
4
+ """
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import List, Tuple
8
+ from PIL import Image
9
+
10
+
11
+ class OCRBackend(ABC):
12
+ """Abstract base class for OCR backends."""
13
+
14
+ @abstractmethod
15
+ def load_model(self):
16
+ """Load the OCR model."""
17
+ pass
18
+
19
+ @abstractmethod
20
+ def process_image(self, image: Image.Image, temperature: float = 0.1) -> str:
21
+ """
22
+ Process a single image and return extracted text.
23
+
24
+ Args:
25
+ image: PIL Image to process
26
+ temperature: Sampling temperature (0 = greedy)
27
+
28
+ Returns:
29
+ Extracted text as string
30
+ """
31
+ pass
32
+
33
+ @abstractmethod
34
+ def get_backend_info(self) -> dict:
35
+ """Return backend information (name, device, memory usage, etc.)."""
36
+ pass
37
+
38
+
39
+ def get_available_backends() -> List[str]:
40
+ """Return list of available backend names."""
41
+ backends = ["pytorch"]
42
+
43
+ # Check for GGUF support (binary or python package)
44
+ from pathlib import Path
45
+ project_root = Path(__file__).parent.parent
46
+ cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
47
+
48
+ if cli_path.exists():
49
+ backends.append("gguf")
50
+ else:
51
+ # Fallback check for python package (though we prefer CLI now)
52
+ try:
53
+ import llama_cpp
54
+ backends.append("gguf")
55
+ except ImportError:
56
+ pass
57
+
58
+ return backends
59
+
60
+
61
+ def create_backend(backend_name: str) -> OCRBackend:
62
+ """
63
+ Factory function to create backend instance.
64
+
65
+ Args:
66
+ backend_name: "pytorch" or "gguf"
67
+
68
+ Returns:
69
+ OCRBackend instance
70
+ """
71
+ if backend_name == "pytorch":
72
+ from .pytorch_backend import PyTorchBackend
73
+ return PyTorchBackend()
74
+ elif backend_name == "gguf":
75
+ from .gguf_backend import GGUFBackend
76
+ return GGUFBackend()
77
+ else:
78
+ raise ValueError(f"Unknown backend: {backend_name}. Available: {get_available_backends()}")
backends/gguf_backend.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GGUF backend for LightOnOCR-1B using local llama-mtmd-cli binary.
3
+ """
4
+
5
+ import os
6
+ import io
7
+ import tempfile
8
+ import subprocess
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ from typing import Optional
12
+
13
+ from . import OCRBackend
14
+
15
+
16
+ class GGUFBackend(OCRBackend):
17
+ """GGUF-based OCR backend using local llama-mtmd-cli binary."""
18
+
19
+ def __init__(self, model_path: Optional[str] = None, mmproj_path: Optional[str] = None):
20
+ """
21
+ Initialize GGUF backend.
22
+
23
+ Args:
24
+ model_path: Path to GGUF model file
25
+ mmproj_path: Path to mmproj file
26
+ """
27
+ self.model_path = model_path
28
+ self.mmproj_path = mmproj_path
29
+ self.cli_path = self._find_cli_binary()
30
+ self._auto_detect_files()
31
+
32
+ def _find_cli_binary(self) -> Optional[str]:
33
+ """Find the llama-mtmd-cli binary."""
34
+ # Check project root llama.cpp build
35
+ project_root = Path(__file__).parent.parent
36
+ cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
37
+ if cli_path.exists():
38
+ return str(cli_path)
39
+ return None
40
+
41
+ def _auto_detect_files(self):
42
+ """Try to find GGUF model and mmproj files."""
43
+ if self.model_path and Path(self.model_path).exists():
44
+ if not self.mmproj_path:
45
+ model_dir = Path(self.model_path).parent
46
+ for mmproj_file in model_dir.glob("*mmproj*.gguf"):
47
+ self.mmproj_path = str(mmproj_file)
48
+ print(f"Auto-detected mmproj: {self.mmproj_path}")
49
+ break
50
+ return
51
+
52
+ search_paths = [
53
+ Path.cwd() / "models",
54
+ Path.cwd() / "gguf_models",
55
+ ]
56
+
57
+ for search_path in search_paths:
58
+ if not search_path.exists():
59
+ continue
60
+ for gguf_file in search_path.rglob("*.gguf"):
61
+ if "lightonocr" in gguf_file.name.lower() and "mmproj" not in gguf_file.name.lower():
62
+ self.model_path = str(gguf_file)
63
+ print(f"Auto-detected model: {self.model_path}")
64
+ model_dir = gguf_file.parent
65
+ for mmproj_file in model_dir.glob("*mmproj*.gguf"):
66
+ self.mmproj_path = str(mmproj_file)
67
+ print(f"Auto-detected mmproj: {self.mmproj_path}")
68
+ break
69
+ break
70
+ if self.model_path:
71
+ break
72
+
73
+ def load_model(self):
74
+ """Verify model, mmproj and CLI binary exist."""
75
+ if not self.cli_path:
76
+ raise RuntimeError(
77
+ "llama-mtmd-cli binary not found.\n"
78
+ "Please build llama.cpp locally:\n"
79
+ " git clone https://github.com/ggerganov/llama.cpp\n"
80
+ " cd llama.cpp && mkdir build && cd build\n"
81
+ " cmake .. -DGGML_METAL=ON && cmake --build . --config Release"
82
+ )
83
+
84
+ if not self.model_path or not Path(self.model_path).exists():
85
+ raise ValueError("GGUF model not found. Run download_gguf_model.py")
86
+
87
+ if not self.mmproj_path or not Path(self.mmproj_path).exists():
88
+ raise ValueError("mmproj file not found. Run download_gguf_model.py")
89
+
90
+ print(f"GGUF Backend ready:")
91
+ print(f" CLI: {self.cli_path}")
92
+ print(f" Model: {self.model_path}")
93
+ print(f" Projector: {self.mmproj_path}")
94
+
95
+ def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
96
+ """Process image using llama-mtmd-cli."""
97
+ if not self.cli_path:
98
+ self.load_model()
99
+
100
+ # Save image to temp file
101
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
102
+ image.save(tmp_img.name)
103
+ tmp_img_path = tmp_img.name
104
+
105
+ try:
106
+ cmd = [
107
+ self.cli_path,
108
+ "-m", self.model_path,
109
+ "--mmproj", self.mmproj_path,
110
+ "--image", tmp_img_path,
111
+ "-p", "Extract all text from this image. Be precise and include all visible text.",
112
+ "--temp", str(temperature),
113
+ "--n-predict", str(max_tokens),
114
+ # "--log-disable" # Removed as it suppresses output
115
+ ]
116
+
117
+ # Run CLI
118
+ result = subprocess.run(cmd, capture_output=True, text=True)
119
+
120
+ if result.returncode != 0:
121
+ print(f"CLI Error: {result.stderr}")
122
+ raise RuntimeError(f"llama-mtmd-cli failed: {result.stderr}")
123
+
124
+ # stdout contains the generated text, stderr contains logs
125
+ return result.stdout.strip()
126
+
127
+ finally:
128
+ if os.path.exists(tmp_img_path):
129
+ os.unlink(tmp_img_path)
130
+
131
+ def get_backend_info(self) -> dict:
132
+ return {
133
+ "name": "GGUF (llama-mtmd-cli)",
134
+ "device": "Metal (via CLI)",
135
+ "model_path": self.model_path or "not found",
136
+ "mmproj_path": self.mmproj_path or "not found",
137
+ "cli_path": self.cli_path
138
+ }
backends/pytorch_backend.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PyTorch backend for LightOnOCR-1B.
3
+ Uses Mistral3ForConditionalGeneration with custom weight remapping.
4
+ """
5
+
6
+ import torch
7
+ import platform
8
+ from pathlib import Path
9
+ from PIL import Image
10
+ from transformers import AutoConfig, PixtralProcessor, Mistral3ForConditionalGeneration
11
+ from safetensors.torch import load_file
12
+ from huggingface_hub import hf_hub_download
13
+
14
+ from . import OCRBackend
15
+
16
+
17
+ class PyTorchBackend(OCRBackend):
18
+ """PyTorch-based OCR backend using transformers."""
19
+
20
+ def __init__(self):
21
+ self.model = None
22
+ self.processor = None
23
+ self.device = None
24
+ self.dtype = None
25
+ self.model_id = "lightonai/LightOnOCR-1B-1025"
26
+
27
+ def load_model(self):
28
+ """Load the PyTorch model with custom weight remapping."""
29
+ if self.model is not None:
30
+ return # Already loaded
31
+
32
+ print(f"Loading {self.model_id} (PyTorch backend)...")
33
+
34
+ # Load processor
35
+ self.processor = PixtralProcessor.from_pretrained(self.model_id, trust_remote_code=True)
36
+
37
+ # Instantiate model with config
38
+ config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
39
+ self.model = Mistral3ForConditionalGeneration(config)
40
+
41
+ # Download and remap weights
42
+ print(" Downloading and remapping weights...")
43
+ weights_path = hf_hub_download(repo_id=self.model_id, filename="model.safetensors")
44
+ state_dict = load_file(weights_path)
45
+
46
+ new_state_dict = {}
47
+ for k, v in state_dict.items():
48
+ new_key = k
49
+ if "vision_encoder" in k:
50
+ new_key = k.replace("vision_encoder", "vision_tower")
51
+ if "vision_projection" in k:
52
+ new_key = k.replace("vision_projection", "multi_modal_projector")
53
+ new_state_dict[new_key] = v
54
+
55
+ self.model.load_state_dict(new_state_dict, strict=False)
56
+
57
+ # Determine device
58
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
59
+ if platform.system() == "Darwin" and "arm" in platform.machine().lower():
60
+ self.device = "mps"
61
+
62
+ # MPS has issues with float16, use float32
63
+ if self.device == "mps":
64
+ self.dtype = torch.float32
65
+ else:
66
+ self.dtype = torch.float16 if self.device == "cuda" else torch.float32
67
+
68
+ self.model = self.model.to(device=self.device, dtype=self.dtype)
69
+ self.model.eval()
70
+
71
+ print(f" Model loaded on {self.device} ({self.dtype})")
72
+
73
+ def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
74
+ """Process image using PyTorch model."""
75
+ if self.model is None:
76
+ self.load_model()
77
+
78
+ messages = [
79
+ {
80
+ "role": "user",
81
+ "content": [
82
+ {"type": "image", "image": image},
83
+ {"type": "text", "text": "Extract all text from this image. Be precise and include all visible text."}
84
+ ]
85
+ }
86
+ ]
87
+
88
+ prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
89
+ inputs = self.processor(text=prompt, images=image, return_tensors="pt")
90
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
91
+
92
+ # Ensure pixel_values match model dtype (critical for MPS)
93
+ if 'pixel_values' in inputs:
94
+ inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)
95
+
96
+ with torch.no_grad():
97
+ generated_ids = self.model.generate(
98
+ **inputs,
99
+ max_new_tokens=max_tokens,
100
+ temperature=temperature,
101
+ do_sample=temperature > 0,
102
+ pad_token_id=self.processor.tokenizer.eos_token_id
103
+ )
104
+
105
+ input_len = inputs['input_ids'].shape[1] if 'input_ids' in inputs else 0
106
+ new_tokens = generated_ids[:, input_len:] if generated_ids.shape[1] > input_len else generated_ids
107
+ generated_text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
108
+
109
+ return generated_text.strip()
110
+
111
+ def get_backend_info(self) -> dict:
112
+ """Return backend information."""
113
+ return {
114
+ "name": "PyTorch",
115
+ "device": str(self.device) if self.device else "not loaded",
116
+ "dtype": str(self.dtype) if self.dtype else "not loaded",
117
+ "model_id": self.model_id,
118
+ "loaded": self.model is not None
119
+ }
docs/gguf_setup.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GGUF Backend Setup Guide
2
+
3
+ ## Quick Start (Recommended)
4
+
5
+ Since `llama-cpp-python` doesn't yet support LightOnOCR, we must build `llama.cpp` locally.
6
+
7
+ ### 1. Build llama.cpp locally
8
+
9
+ ```bash
10
+ # Clone repository
11
+ git clone https://github.com/ggerganov/llama.cpp
12
+ cd llama.cpp
13
+
14
+ # Create build directory
15
+ mkdir build && cd build
16
+
17
+ # Build with Metal support (macOS)
18
+ cmake .. -DGGML_METAL=ON
19
+ cmake --build . --config Release -j 8
20
+
21
+ # Verify build
22
+ ./bin/llama-mtmd-cli --help
23
+ ```
24
+
25
+ ### 2. Download GGUF Model
26
+
27
+ ```bash
28
+ # Return to project root
29
+ cd ../../
30
+
31
+ # Run download script
32
+ python download_gguf_model.py
33
+ ```
34
+
35
+ ### 3. Use GGUF Backend
36
+
37
+ ```bash
38
+ # CLI
39
+ python ocr_cli.py document.pdf --backend gguf
40
+
41
+ # Gradio UI
42
+ python app.py
43
+ # Select "gguf" from backend dropdown
44
+ ```
45
+
46
+ ## Performance
47
+
48
+ The custom built `llama-mtmd-cli` provides incredible performance on Apple Silicon:
49
+
50
+ | Backend | Time per Page | Speedup |
51
+ |---------|---------------|---------|
52
+ | PyTorch (Original) | ~4 mins | 1x |
53
+ | PyTorch (Optimized) | ~40 sec | 6x |
54
+ | **GGUF (llama-mtmd-cli)** | **~3 sec** | **80x** ⭐ |
55
+
56
+ ## Troubleshooting
57
+
58
+ ### "llama-mtmd-cli binary not found"
59
+ Ensure you successfully built `llama.cpp` and the binary exists at `llama.cpp/build/bin/llama-mtmd-cli`.
60
+
61
+ ### "GGUF model not found"
62
+ Run `python download_gguf_model.py` to download the required model files.
docs/gguf_status.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GGUF Backend Status
2
+
3
+ ## Current Status: ⚠️ Not Yet Supported
4
+
5
+ The GGUF backend infrastructure is **fully implemented and ready**, but cannot be used yet due to a limitation in llama.cpp.
6
+
7
+ ### Issue
8
+
9
+ LightOnOCR-1B uses a custom multimodal projector type (`lightonocr`) that is not yet supported in the standard llama.cpp library:
10
+
11
+ ```
12
+ clip_init: failed to load model: load_hparams: unknown projector type: lightonocr
13
+ ```
14
+
15
+ ### What's Ready
16
+
17
+ ✅ llama-cpp-python installed with Metal support
18
+ ✅ GGUF Q8_0 model downloaded (767MB)
19
+ ✅ mmproj file downloaded (417MB)
20
+ ✅ Complete backend implementation (`backends/gguf_backend.py`)
21
+ ✅ CLI and UI integration
22
+
23
+ ### What's Needed
24
+
25
+ ❌ llama.cpp support for LightOnOCR projector type
26
+
27
+ ### Workaround Options
28
+
29
+ 1. **Wait for official support** - Monitor llama.cpp repository
30
+ 2. **Use PyTorch backend** - Fully functional, ~40s per page
31
+ 3. **Contribute to llama.cpp** - Add LightOnOCR projector support
32
+
33
+ ### When Will GGUF Work?
34
+
35
+ The GGUF backend will work automatically once llama.cpp adds support for the `lightonocr` projector type. No code changes will be needed in this project - just update llama-cpp-python:
36
+
37
+ ```bash
38
+ pip install --upgrade llama-cpp-python
39
+ ```
40
+
41
+ ### Alternative: Use PyTorch
42
+
43
+ The PyTorch backend is fully optimized and works well:
44
+
45
+ ```bash
46
+ # Recommended settings
47
+ python ocr_cli.py document.pdf --scale 1.0 --max-tokens 1024
48
+
49
+ # Result: ~40 seconds per page
50
+ ```
51
+
52
+ ## References
53
+
54
+ - [llama.cpp GitHub](https://github.com/ggerganov/llama.cpp)
55
+ - [LightOnOCR GGUF Models](https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF)
56
+ - [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
57
+
58
+ ## Monitoring
59
+
60
+ Check these for updates:
61
+ - llama.cpp issues/PRs mentioning LightOnOCR
62
+ - llama-cpp-python release notes
63
+ - LightOnOCR Hugging Face discussions
docs/performance_optimization.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Оптимізація швидкодії LightOnOCR-1B на M3 Max
2
+
3
+ ## Поточна ситуація
4
+ - **PyTorch на MPS**: ~4 хвилини на сторінку (дуже повільно)
5
+ - **Причина**: MPS backend значно повільніший за CUDA для трансформерів
6
+
7
+ ## Швидкі оптимізації (PyTorch)
8
+
9
+ ### 1. Зменшення max_tokens
10
+ ```python
11
+ # У backends/pytorch_backend.py, рядок ~95
12
+ generated_ids = self.model.generate(
13
+ **inputs,
14
+ max_new_tokens=1024, # Було 2048, зменшити до 512-1024
15
+ temperature=temperature,
16
+ do_sample=temperature > 0,
17
+ pad_token_id=self.processor.tokenizer.eos_token_id
18
+ )
19
+ ```
20
+
21
+ ### 2. Використання нижчої роздільної здатності
22
+ ```bash
23
+ # Замість scale=1.5, використовуйте scale=1.0
24
+ python ocr_cli.py document.pdf --scale 1.0
25
+ ```
26
+
27
+ ## Рекомендоване рішення: GGUF + llama.cpp
28
+
29
+ ### Чому GGUF швидший?
30
+ - Оптимізований для Apple Silicon (Metal)
31
+ - Квантизація (Q8_0) зменшує розмір і прискорює
32
+ - Спеціалізований inference engine
33
+
34
+ ### Встановлення
35
+
36
+ ```bash
37
+ # 1. Встановити llama-cpp-python з Metal support
38
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
39
+
40
+ # 2. Завантажити GGUF модель
41
+ git lfs install
42
+ git clone https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF
43
+ # Або
44
+ git clone https://huggingface.co/Mungert/LightOnOCR-1B-1025-GGUF
45
+ ```
46
+
47
+ ### Використання
48
+ ```bash
49
+ # Після завантаження моделі
50
+ python ocr_cli.py document.pdf --backend gguf --model-path path/to/model.gguf
51
+ ```
52
+
53
+ ## Очікувані результати
54
+ - **PyTorch оптимізований**: ~2-3 хвилини на сторінку
55
+ - **GGUF Q8_0**: ~30-60 секунд на сторінку (орієнтовно)
56
+
57
+ ## Примітка
58
+ GGUF backend уже реалізований у `backends/gguf_backend.py`, але поки що заблокований: llama.cpp ще не підтримує тип проєктора `lightonocr`. Щоб скористатися ним, потрібно:
59
+ 1. Завантажити GGUF модель
60
+ 2. Дочекатися підтримки проєктора `lightonocr` у llama.cpp / llama-cpp-python
download_gguf_model.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download GGUF model and mmproj files for LightOnOCR-1B.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
def download_gguf_model():
    """Download (or update) the LightOnOCR-1B GGUF model repository via git-lfs.

    Clones https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF into
    models/lightonocr-gguf (or runs `git pull` if it is already cloned),
    then lists the downloaded *.gguf files and prints usage hints.

    Exits with status 1 if git or git-lfs is not installed.
    """
    models_dir = Path("models/lightonocr-gguf")
    models_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("LightOnOCR-1B GGUF Model Download")
    print("=" * 60)
    print()
    print("This will download ~1-2GB of model files.")
    print(f"Target directory: {models_dir.absolute()}")
    print()

    # Verify git-lfs is available before attempting the LFS-backed clone.
    import subprocess
    try:
        result = subprocess.run(["git", "lfs", "version"], capture_output=True, text=True)
        if result.returncode != 0:
            print("ERROR: git-lfs not installed!")
            print()
            print("Install git-lfs first:")
            print("  macOS: brew install git-lfs")
            print("  Then run: git lfs install")
            sys.exit(1)
    except FileNotFoundError:
        # `git` binary itself is missing.
        print("ERROR: git not found!")
        sys.exit(1)

    # Clone the repository, or update it if it already exists.
    repo_url = "https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF"

    if (models_dir / ".git").exists():
        print("Model directory already exists. Updating...")
        # BUG FIX: run `git pull` with cwd= instead of os.chdir(). The old
        # os.chdir(models_dir) made the relative `models_dir` path used by
        # the glob() below resolve to the wrong directory, and leaked the
        # working-directory change to the rest of the process.
        subprocess.run(["git", "pull"], check=True, cwd=models_dir)
    else:
        print(f"Cloning from {repo_url}...")
        subprocess.run([
            "git", "clone",
            repo_url,
            str(models_dir)
        ], check=True)

    print()
    print("✓ Download complete!")
    print()
    print("Downloaded files:")
    for gguf_file in models_dir.glob("*.gguf"):
        size_mb = gguf_file.stat().st_size / (1024 * 1024)
        print(f"  - {gguf_file.name} ({size_mb:.1f} MB)")

    print()
    print("Recommended files for use:")
    print("  Model:  LightOnOCR-1B-1025-Q8_0.gguf (best quality)")
    print("  mmproj: mmproj-Q8_0.gguf or mmproj-f16.gguf")
    print()
    print("Usage:")
    print("  python ocr_cli.py document.pdf --backend gguf")
    print()


if __name__ == "__main__":
    download_gguf_model()
download_model.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from huggingface_hub import HfApi, snapshot_download
8
+
9
+
10
+ def parse_args() -> argparse.Namespace:
11
+ parser = argparse.ArgumentParser(
12
+ description="Download a model repository from Hugging Face Hub."
13
+ )
14
+ parser.add_argument(
15
+ "model_id",
16
+ nargs="?",
17
+ default="lightonai/LightOnOCR-1B-1025",
18
+ help="Model repository to download (default: %(default)s)",
19
+ )
20
+ parser.add_argument(
21
+ "--revision",
22
+ default=None,
23
+ help="Specific git revision (branch/tag/commit) to download.",
24
+ )
25
+ parser.add_argument(
26
+ "--cache-dir",
27
+ default=None,
28
+ help="Cache directory where the model snapshot will be stored.",
29
+ )
30
+ parser.add_argument(
31
+ "--local-dir",
32
+ default=None,
33
+ help="Optional local directory to copy the snapshot into after download.",
34
+ )
35
+ parser.add_argument(
36
+ "--token",
37
+ default=None,
38
+ help="Hugging Face access token; defaults to HF_TOKEN or HUGGINGFACEHUB_API_TOKEN env vars.",
39
+ )
40
+ parser.add_argument(
41
+ "--allow-pattern",
42
+ action="append",
43
+ default=None,
44
+ help="File glob pattern(s) to include when downloading.",
45
+ )
46
+ parser.add_argument(
47
+ "--ignore-pattern",
48
+ action="append",
49
+ default=None,
50
+ help="File glob pattern(s) to exclude when downloading.",
51
+ )
52
+ parser.add_argument(
53
+ "--offline",
54
+ action="store_true",
55
+ help="Run in offline mode, using only the local cache.",
56
+ )
57
+ return parser.parse_args()
58
+
59
+
60
+ def resolve_token(user_token: str | None) -> str | None:
61
+ if user_token:
62
+ return user_token
63
+ return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
64
+
65
+
66
+ def ensure_auth(token: str | None) -> None:
67
+ if token:
68
+ return
69
+ try:
70
+ api = HfApi()
71
+ if api.whoami():
72
+ return
73
+ except Exception:
74
+ pass
75
+
76
+ raise RuntimeError(
77
+ "Hugging Face token not provided. Set HF_TOKEN or run `huggingface-cli login`."
78
+ )
79
+
80
+
81
def main() -> None:
    """CLI entry point: authenticate, fetch the model snapshot, print its path."""
    args = parse_args()
    token = resolve_token(args.token)

    if not args.offline:
        # Online downloads need some form of authentication up front;
        # offline mode only touches the local cache.
        ensure_auth(token)

    download_kwargs = {
        "repo_id": args.model_id,
        "revision": args.revision,
        "cache_dir": args.cache_dir,
        "local_dir": args.local_dir,
        "allow_patterns": args.allow_pattern,
        "ignore_patterns": args.ignore_pattern,
        "token": token,
        "local_files_only": args.offline,
    }
    try:
        snapshot_path = snapshot_download(**download_kwargs)
    except Exception as exc:
        print(f"Failed to download {args.model_id}: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"Model snapshot available at: {Path(snapshot_path).resolve()}")


if __name__ == "__main__":
    main()
llama.cpp ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit bd2a93d4753c4f00443f561ee039220283016ee8
ocr_cli.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCR CLI utility for LightOnOCR-1B with backend support.
4
+ Supports PyTorch and GGUF backends for flexible performance/quality trade-offs.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import argparse
10
+ import time
11
+ from pathlib import Path
12
+ from PIL import Image
13
+ import pypdfium2 as pdfium
14
+
15
+ # Add project root to path
16
+ sys.path.insert(0, str(Path(__file__).parent))
17
+
18
+ from backends import create_backend, get_available_backends
19
+
20
+
21
def render_pdf_page(page, scale=2.0):
    """Rasterize a single PDF page to a PIL Image.

    Args:
        page: pypdfium2 page object.
        scale: Render scale factor; higher yields a larger bitmap.
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
24
+
25
+
26
def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024):
    """
    Process a PDF or image file with OCR and write <input>.md next to it.

    Args:
        input_path: Path to input file
        backend_name: "pytorch" or "gguf"
        scale: PDF rendering scale (lower = faster, higher = better quality)
        temperature: Sampling temperature for generation
        max_tokens: Maximum tokens to generate (lower = faster)
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.")
        return

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()

    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images: render every PDF page, or open the single image directly.
    images = []
    if input_path.suffix.lower() == '.pdf':
        print(f"\nProcessing PDF: {input_path.name}")
        pdf = pdfium.PdfDocument(str(input_path))
        try:
            num_pages = len(pdf)
            print(f" Total pages: {num_pages}")
            print(f" Rendering scale: {scale}x")

            for i in range(num_pages):
                print(f" Rendering page {i+1}/{num_pages}...", end=" ")
                start = time.time()
                images.append(render_pdf_page(pdf[i], scale=scale))
                print(f"({time.time() - start:.1f}s)")
        finally:
            # BUG FIX: close the document even when rendering raises.
            pdf.close()
    else:
        print(f"Processing image: {input_path.name}")
        images = [Image.open(input_path)]

    # BUG FIX: a zero-page PDF previously caused a ZeroDivisionError in the
    # per-page timing report below; bail out early instead.
    if not images:
        print("No pages to process.")
        return

    # Run OCR page by page; a failure on one page is recorded inline so the
    # remaining pages are still processed.
    all_texts = []
    total_start = time.time()

    for i, img in enumerate(images):
        print(f"\n OCR on page {i+1}/{len(images)}...", end=" ")
        start = time.time()

        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start

            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f" Preview: {text[:80]}...")
        except Exception as e:
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results as Markdown next to the input file.
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')

    total_time = time.time() - total_start
    print(f"\n✓ OCR Complete!")
    print(f" Total time: {total_time:.1f}s ({total_time/len(images):.1f}s per page)")
    print(f" Output: {output_path}")
98
+
99
+
100
def main():
    """Command-line entry point: parse arguments and run OCR on the input file."""
    arg_parser = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process with PyTorch (default, best quality)
  python ocr_cli.py document.pdf

  # Process with GGUF (faster, requires llama-cpp-python)
  python ocr_cli.py document.pdf --backend gguf

  # Fast processing with lower resolution
  python ocr_cli.py document.pdf --scale 1.5

  # High quality with higher resolution
  python ocr_cli.py document.pdf --scale 3.0
"""
    )

    arg_parser.add_argument(
        "input_file",
        nargs="?",
        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
        help="Input PDF or image file (default: test PDF)"
    )
    arg_parser.add_argument(
        "--backend",
        choices=get_available_backends(),
        default="pytorch",
        help="Backend to use for inference (default: pytorch)"
    )
    arg_parser.add_argument(
        "--scale",
        type=float,
        default=2.0,
        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)"
    )
    arg_parser.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Sampling temperature (default: 0.1, 0=greedy)"
    )
    arg_parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate (default: 1024, range: 256-2048)"
    )

    args = arg_parser.parse_args()

    # Out-of-range scales are allowed, but warn the user.
    if args.scale < 1.0 or args.scale > 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
    except Exception as e:
        print(f"\nFatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1 +1,11 @@
1
- pypdfium2 == 4.30.0
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.42.0
2
+ pillow>=10.3.0,<11
3
+ pypdfium2==4.30.0
4
+ requests>=2.31.0,<3
5
+ huggingface_hub>=0.24.0
6
+ torch>=2.0.0
7
+ transformers>=4.36.0
8
+ accelerate>=0.26.0
9
+ safetensors>=0.4.0
10
+ # llama-cpp-python is optional for GGUF backend support (or use local build)
11
+ # llama-cpp-python>=0.3.0