Spaces:
Sleeping
Sleeping
feat: update ggml kernels, webui components, model templates, and build configurations
Browse files- .gitignore +24 -0
- README.md +61 -16
- app.py +157 -146
- backends/__init__.py +78 -0
- backends/gguf_backend.py +138 -0
- backends/pytorch_backend.py +119 -0
- docs/gguf_setup.md +62 -0
- docs/gguf_status.md +63 -0
- docs/performance_optimization.md +60 -0
- download_gguf_model.py +71 -0
- download_model.py +107 -0
- llama.cpp +1 -0
- ocr_cli.py +177 -0
- requirements.txt +11 -1
.gitignore
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python builds
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.so
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
|
| 10 |
+
# Editor settings
|
| 11 |
+
.DS_Store
|
| 12 |
+
|
| 13 |
+
# Environment files
|
| 14 |
+
.env
|
| 15 |
+
.env.local
|
| 16 |
+
|
| 17 |
+
# Model caches
|
| 18 |
+
models/
|
| 19 |
+
*.safetensors
|
| 20 |
+
|
| 21 |
+
# Test docs
|
| 22 |
+
test_docs/
|
| 23 |
+
help_docs/
|
| 24 |
+
|
README.md
CHANGED
|
@@ -1,16 +1,61 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
-
|
| 13 |
-
|
| 14 |
-
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LightOnOCR-1B Demo
|
| 2 |
+
|
| 3 |
+
High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon.
|
| 4 |
+
|
| 5 |
+
## 🚀 Performance
|
| 6 |
+
- **GGUF Backend:** ~3–4 seconds per page (measured on an M3 Max)
|
| 7 |
+
- **PyTorch Backend:** ~40 seconds per page
|
| 8 |
+
|
| 9 |
+
## Features
|
| 10 |
+
- 📄 PDF and image support
|
| 11 |
+
- 🔄 Seamless switching between GGUF and PyTorch backends
|
| 12 |
+
- 🎛️ Configurable resolution (scale) and token generation
|
| 13 |
+
- 🖥️ CLI and Gradio web interface
|
| 14 |
+
- 🍎 Full Metal/MPS support
|
| 15 |
+
|
| 16 |
+
## Quick Start
|
| 17 |
+
|
| 18 |
+
### 1. Prerequisites
|
| 19 |
+
- Python 3.10+
|
| 20 |
+
- `cmake` and `git`
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
pip install -r requirements.txt
|
| 24 |
+
pip install accelerate
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
### 2. Setup GGUF (Highly Recommended)
|
| 28 |
+
See [GGUF Setup Guide](docs/gguf_setup.md).
|
| 29 |
+
|
| 30 |
+
1. Build `llama.cpp` locally:
|
| 31 |
+
```bash
|
| 32 |
+
git clone https://github.com/ggerganov/llama.cpp
|
| 33 |
+
cd llama.cpp && mkdir build && cd build
|
| 34 |
+
cmake .. -DGGML_METAL=ON && cmake --build . --config Release -j 8
|
| 35 |
+
cd ../..
|
| 36 |
+
```
|
| 37 |
+
2. Download model:
|
| 38 |
+
```bash
|
| 39 |
+
python download_gguf_model.py
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
### 3. Usage
|
| 43 |
+
|
| 44 |
+
**Command Line:**
|
| 45 |
+
```bash
|
| 46 |
+
# Fastest
|
| 47 |
+
python ocr_cli.py document.pdf --backend gguf
|
| 48 |
+
|
| 49 |
+
# High Quality
|
| 50 |
+
python ocr_cli.py document.pdf --backend gguf --scale 2.0
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
**Web Interface:**
|
| 54 |
+
```bash
|
| 55 |
+
python app.py
|
| 56 |
+
```
|
| 57 |
+
Open http://127.0.0.1:7860 and select **GGUF** backend.
|
| 58 |
+
|
| 59 |
+
## Documentation
|
| 60 |
+
- [GGUF Setup Guide](docs/gguf_setup.md)
|
| 61 |
+
- [Performance Optimization](docs/performance_optimization.md)
|
app.py
CHANGED
|
@@ -1,76 +1,92 @@
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
-
import
|
| 4 |
-
import base64
|
| 5 |
-
import requests
|
| 6 |
import gradio as gr
|
|
|
|
| 7 |
from PIL import Image
|
| 8 |
-
from io import BytesIO
|
| 9 |
import pypdfium2 as pdfium
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
-
def
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
-
def render_pdf_page(page,
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
pixel_height = height * scale
|
| 30 |
-
resize_factor = min(max_resolution / pixel_width, max_resolution / pixel_height)
|
| 31 |
-
target_scale = scale * resize_factor
|
| 32 |
-
return page.render(scale=target_scale, rev_byteorder=True).to_pil()
|
| 33 |
|
| 34 |
|
| 35 |
-
def process_pdf(pdf_path, num_pages=1):
|
|
|
|
| 36 |
pdf = pdfium.PdfDocument(pdf_path)
|
| 37 |
total_pages = len(pdf)
|
| 38 |
-
pages_to_process = min(num_pages, total_pages,
|
| 39 |
images = []
|
| 40 |
|
| 41 |
for i in range(pages_to_process):
|
| 42 |
page = pdf[i]
|
| 43 |
-
img = render_pdf_page(page)
|
| 44 |
images.append(img)
|
| 45 |
|
| 46 |
pdf.close()
|
| 47 |
return images, total_pages
|
| 48 |
|
| 49 |
|
| 50 |
-
def process_input(file_input, temperature, num_pages):
|
|
|
|
| 51 |
if file_input is None:
|
| 52 |
-
yield "Please upload an image or PDF first.", "", "", None
|
| 53 |
return
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
images_to_process = []
|
| 56 |
page_info = ""
|
| 57 |
display_image = None
|
| 58 |
-
|
| 59 |
-
file_path = file_input if isinstance(file_input, str) else file_input.name
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
try:
|
| 63 |
-
images_to_process, total_pages = process_pdf(file_path, num_pages)
|
| 64 |
if len(images_to_process) == 0:
|
| 65 |
-
yield "Error
|
| 66 |
return
|
| 67 |
display_image = images_to_process[0]
|
| 68 |
-
|
| 69 |
-
page_info = f"Processing page 1 of {total_pages}"
|
| 70 |
-
else:
|
| 71 |
-
page_info = f"Processing {len(images_to_process)} pages of {total_pages}"
|
| 72 |
except Exception as e:
|
| 73 |
-
yield f"Error processing PDF: {str(e)}", "", "", None
|
| 74 |
return
|
| 75 |
else:
|
| 76 |
try:
|
|
@@ -79,84 +95,44 @@ def process_input(file_input, temperature, num_pages):
|
|
| 79 |
display_image = img
|
| 80 |
page_info = "Processing image"
|
| 81 |
except Exception as e:
|
| 82 |
-
yield f"Error opening image: {str(e)}", "", "", None
|
| 83 |
return
|
| 84 |
-
|
| 85 |
-
content = [{"type": "text", "text": ""}]
|
| 86 |
-
|
| 87 |
-
for img in images_to_process:
|
| 88 |
-
try:
|
| 89 |
-
b64_image = image_to_base64(img)
|
| 90 |
-
content.append({
|
| 91 |
-
"type": "image_url",
|
| 92 |
-
"image_url": {"url": f"data:image/png;base64,{b64_image}"}
|
| 93 |
-
})
|
| 94 |
-
except Exception as e:
|
| 95 |
-
yield f"Error encoding image: {str(e)}", "", "", display_image
|
| 96 |
-
return
|
| 97 |
-
|
| 98 |
-
payload = {
|
| 99 |
-
"model": MODEL,
|
| 100 |
-
"messages": [
|
| 101 |
-
{
|
| 102 |
-
"role": "user",
|
| 103 |
-
"content": content
|
| 104 |
-
}
|
| 105 |
-
],
|
| 106 |
-
"temperature": temperature,
|
| 107 |
-
"stream": True
|
| 108 |
-
}
|
| 109 |
|
|
|
|
| 110 |
try:
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
if 'choices' in chunk and len(chunk['choices']) > 0:
|
| 134 |
-
delta = chunk['choices'][0].get('delta', {})
|
| 135 |
-
content_delta = delta.get('content', '')
|
| 136 |
-
if content_delta:
|
| 137 |
-
accumulated_response += content_delta
|
| 138 |
-
if first_chunk:
|
| 139 |
-
yield accumulated_response, accumulated_response, page_info, display_image
|
| 140 |
-
first_chunk = False
|
| 141 |
-
else:
|
| 142 |
-
yield accumulated_response, accumulated_response, page_info, gr.update()
|
| 143 |
-
except json.JSONDecodeError:
|
| 144 |
-
continue
|
| 145 |
-
|
| 146 |
except Exception as e:
|
| 147 |
-
error_msg = f"Error: {str(e)}"
|
| 148 |
-
yield
|
| 149 |
|
| 150 |
|
| 151 |
-
|
|
|
|
| 152 |
gr.Markdown(
|
| 153 |
"""
|
| 154 |
-
# 📖
|
| 155 |
-
|
| 156 |
-
1. Upload an image or PDF
|
| 157 |
-
2. For PDFs: choose how many pages to process (1-5, default is 1)
|
| 158 |
-
3. Adjust temperature if needed
|
| 159 |
-
4. Click "Extract Text"
|
| 160 |
"""
|
| 161 |
)
|
| 162 |
|
|
@@ -168,62 +144,97 @@ with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
|
|
| 168 |
type="filepath"
|
| 169 |
)
|
| 170 |
rendered_image = gr.Image(
|
| 171 |
-
label="📄 Preview
|
| 172 |
type="pil",
|
| 173 |
-
height=
|
| 174 |
interactive=False
|
| 175 |
)
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
page_info = gr.Textbox(
|
| 185 |
label="Processing Info",
|
| 186 |
value="",
|
| 187 |
interactive=False
|
| 188 |
)
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
value=0.2,
|
| 193 |
-
step=0.05,
|
| 194 |
-
label="Temperature"
|
| 195 |
-
)
|
| 196 |
-
submit_btn = gr.Button("Extract Text", variant="primary")
|
| 197 |
-
clear_btn = gr.Button("Clear", variant="secondary")
|
| 198 |
|
| 199 |
with gr.Column(scale=2):
|
| 200 |
-
|
| 201 |
-
label="
|
| 202 |
-
value="
|
| 203 |
-
|
| 204 |
-
)
|
| 205 |
-
|
| 206 |
-
with gr.Row():
|
| 207 |
-
with gr.Column():
|
| 208 |
-
raw_output = gr.Textbox(
|
| 209 |
-
label="Raw Markdown Output",
|
| 210 |
-
placeholder="Raw text will appear here...",
|
| 211 |
-
lines=20,
|
| 212 |
-
max_lines=30,
|
| 213 |
-
show_copy_button=True
|
| 214 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
|
|
|
| 216 |
submit_btn.click(
|
| 217 |
fn=process_input,
|
| 218 |
-
inputs=[file_input, temperature, num_pages],
|
| 219 |
-
outputs=[output_text, raw_output, page_info, rendered_image]
|
| 220 |
)
|
| 221 |
|
| 222 |
clear_btn.click(
|
| 223 |
-
fn=lambda: (None, "", "", "", None
|
| 224 |
-
outputs=[file_input, output_text, raw_output, page_info, rendered_image
|
| 225 |
)
|
| 226 |
|
| 227 |
|
| 228 |
if __name__ == "__main__":
|
| 229 |
-
demo.launch()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Gradio web interface for LightOnOCR-1B with backend support.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import os
|
| 7 |
+
import sys
|
|
|
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
+
from pathlib import Path
|
| 10 |
from PIL import Image
|
|
|
|
| 11 |
import pypdfium2 as pdfium
|
| 12 |
|
| 13 |
+
# Add project root to path
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 15 |
+
from backends import create_backend, get_available_backends
|
| 16 |
|
| 17 |
+
# Global backend
|
| 18 |
+
BACKEND = None
|
| 19 |
+
CURRENT_BACKEND_NAME = "pytorch"
|
| 20 |
|
| 21 |
|
| 22 |
+
def load_backend(backend_name="pytorch"):
    """Return the process-wide OCR backend, (re)loading it when needed.

    The backend instance is cached in the module-level ``BACKEND`` global;
    a fresh instance is created only when no backend is loaded yet or when
    the caller asks for a different backend than the cached one.

    Args:
        backend_name: Name understood by ``create_backend`` ("pytorch" or
            "gguf").

    Returns:
        The loaded OCRBackend instance.
    """
    global BACKEND, CURRENT_BACKEND_NAME

    cache_miss = BACKEND is None or CURRENT_BACKEND_NAME != backend_name
    if cache_miss:
        print(f"Loading {backend_name} backend...")
        # Assign before load_model() so the partially-initialized backend
        # is observable, matching the previous behavior on failure.
        BACKEND = create_backend(backend_name)
        BACKEND.load_model()
        CURRENT_BACKEND_NAME = backend_name
        print(f"Backend loaded: {BACKEND.get_backend_info()}")
    return BACKEND
|
| 33 |
|
| 34 |
|
| 35 |
+
def render_pdf_page(page, scale=2.0):
    """Render a single pypdfium2 page to a PIL Image.

    Args:
        page: A ``pypdfium2`` page object.
        scale: Render scale factor (higher = larger bitmap).

    Returns:
        The rendered page as a PIL Image.
    """
    # rev_byteorder=True yields RGB ordering suitable for PIL conversion.
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
+
def process_pdf(pdf_path, num_pages=1, scale=2.0):
    """Extract the leading pages of a PDF as PIL images.

    Args:
        pdf_path: Path to the PDF file.
        num_pages: Requested number of pages; capped at the document
            length and at a hard limit of 10.
        scale: Render scale forwarded to ``render_pdf_page``.

    Returns:
        Tuple ``(images, total_pages)``: the rendered PIL images and the
        total page count of the document.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        total_pages = len(pdf)
        pages_to_process = min(num_pages, total_pages, 10)  # Max 10 pages
        images = []

        for i in range(pages_to_process):
            page = pdf[i]
            img = render_pdf_page(page, scale=scale)
            images.append(img)
    finally:
        # Fix: close the document even when rendering raises, so the
        # underlying file handle is not leaked on a failed page.
        pdf.close()
    return images, total_pages
|
| 54 |
|
| 55 |
|
| 56 |
+
def process_input(file_input, backend_name, scale, temperature, max_tokens, num_pages):
|
| 57 |
+
"""Process uploaded file with OCR."""
|
| 58 |
if file_input is None:
|
| 59 |
+
yield "Idle", "Please upload an image or PDF first.", "", "", None
|
| 60 |
return
|
| 61 |
+
|
| 62 |
+
# Load backend
|
| 63 |
+
try:
|
| 64 |
+
backend = load_backend(backend_name)
|
| 65 |
+
except Exception as e:
|
| 66 |
+
error_msg = f"Error loading backend: {str(e)}"
|
| 67 |
+
yield "Error", error_msg, error_msg, "", None
|
| 68 |
+
return
|
| 69 |
+
|
| 70 |
images_to_process = []
|
| 71 |
page_info = ""
|
| 72 |
display_image = None
|
| 73 |
+
|
| 74 |
+
file_path = Path(file_input) if isinstance(file_input, str) else Path(file_input.name)
|
| 75 |
+
if not file_path.exists():
|
| 76 |
+
yield "Error", f"File not accessible: {file_path}", "", "", None
|
| 77 |
+
return
|
| 78 |
+
|
| 79 |
+
# Load images
|
| 80 |
+
if file_path.suffix.lower() == '.pdf':
|
| 81 |
try:
|
| 82 |
+
images_to_process, total_pages = process_pdf(str(file_path), num_pages, scale)
|
| 83 |
if len(images_to_process) == 0:
|
| 84 |
+
yield "Error", "Could not extract pages from PDF.", "", "", None
|
| 85 |
return
|
| 86 |
display_image = images_to_process[0]
|
| 87 |
+
page_info = f"Processing {len(images_to_process)} of {total_pages} pages"
|
|
|
|
|
|
|
|
|
|
| 88 |
except Exception as e:
|
| 89 |
+
yield "Error", f"Error processing PDF: {str(e)}", "", "", None
|
| 90 |
return
|
| 91 |
else:
|
| 92 |
try:
|
|
|
|
| 95 |
display_image = img
|
| 96 |
page_info = "Processing image"
|
| 97 |
except Exception as e:
|
| 98 |
+
yield "Error", f"Error opening image: {str(e)}", "", "", None
|
| 99 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
# Process with OCR
|
| 102 |
try:
|
| 103 |
+
yield "Processing...", "Processing images...", "", page_info, display_image
|
| 104 |
+
|
| 105 |
+
all_texts = []
|
| 106 |
+
for i, img in enumerate(images_to_process):
|
| 107 |
+
try:
|
| 108 |
+
print(f"Processing page {i+1}/{len(images_to_process)}...")
|
| 109 |
+
text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
|
| 110 |
+
all_texts.append(text.strip())
|
| 111 |
+
|
| 112 |
+
# Update progress
|
| 113 |
+
full_text = "\n\n---\n\n".join(all_texts)
|
| 114 |
+
yield "Processing...", full_text, full_text, page_info, display_image
|
| 115 |
+
except Exception as e:
|
| 116 |
+
error_msg = f"Error on page {i+1}: {str(e)}"
|
| 117 |
+
print(f"ERROR: {error_msg}")
|
| 118 |
+
all_texts.append(f"[{error_msg}]")
|
| 119 |
+
continue
|
| 120 |
+
|
| 121 |
+
# Final result
|
| 122 |
+
final_text = "\n\n---\n\n".join(all_texts)
|
| 123 |
+
yield "Complete", final_text, final_text, page_info, display_image
|
| 124 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
except Exception as e:
|
| 126 |
+
error_msg = f"Error during processing: {str(e)}"
|
| 127 |
+
yield "Error", error_msg, "", page_info, display_image
|
| 128 |
|
| 129 |
|
| 130 |
+
# Create Gradio interface
|
| 131 |
+
with gr.Blocks(title="📖 LightOnOCR-1B Demo", theme=gr.themes.Soft()) as demo:
|
| 132 |
gr.Markdown(
|
| 133 |
"""
|
| 134 |
+
# 📖 LightOnOCR-1B - OCR Demo
|
| 135 |
+
Upload an image or PDF to extract text with configurable quality/speed settings.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
"""
|
| 137 |
)
|
| 138 |
|
|
|
|
| 144 |
type="filepath"
|
| 145 |
)
|
| 146 |
rendered_image = gr.Image(
|
| 147 |
+
label="📄 Preview",
|
| 148 |
type="pil",
|
| 149 |
+
height=300,
|
| 150 |
interactive=False
|
| 151 |
)
|
| 152 |
+
|
| 153 |
+
with gr.Accordion("⚙️ Settings", open=True):
|
| 154 |
+
backend_selector = gr.Radio(
|
| 155 |
+
choices=get_available_backends(),
|
| 156 |
+
value="pytorch",
|
| 157 |
+
label="Backend",
|
| 158 |
+
info="PyTorch: best quality | GGUF: faster (if available)"
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
scale_slider = gr.Slider(
|
| 162 |
+
minimum=1.0,
|
| 163 |
+
maximum=3.0,
|
| 164 |
+
value=1.5,
|
| 165 |
+
step=0.5,
|
| 166 |
+
label="PDF Scale",
|
| 167 |
+
info="Higher = better quality, slower"
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
max_tokens_slider = gr.Slider(
|
| 171 |
+
minimum=256,
|
| 172 |
+
maximum=2048,
|
| 173 |
+
value=1024,
|
| 174 |
+
step=256,
|
| 175 |
+
label="Max Tokens",
|
| 176 |
+
info="Lower = faster, may cut off long text"
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
num_pages = gr.Slider(
|
| 180 |
+
minimum=1,
|
| 181 |
+
maximum=10,
|
| 182 |
+
value=1,
|
| 183 |
+
step=1,
|
| 184 |
+
label="PDF Pages",
|
| 185 |
+
info="Number of pages to process (max 10)"
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
temperature = gr.Slider(
|
| 189 |
+
minimum=0.0,
|
| 190 |
+
maximum=1.0,
|
| 191 |
+
value=0.1,
|
| 192 |
+
step=0.05,
|
| 193 |
+
label="Temperature",
|
| 194 |
+
info="0 = deterministic"
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
page_info = gr.Textbox(
|
| 198 |
label="Processing Info",
|
| 199 |
value="",
|
| 200 |
interactive=False
|
| 201 |
)
|
| 202 |
+
|
| 203 |
+
submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
|
| 204 |
+
clear_btn = gr.Button("🗑️ Clear", variant="secondary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
with gr.Column(scale=2):
|
| 207 |
+
status_display = gr.Textbox(
|
| 208 |
+
label="Status",
|
| 209 |
+
value="Idle",
|
| 210 |
+
interactive=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
)
|
| 212 |
+
|
| 213 |
+
with gr.Tabs():
|
| 214 |
+
with gr.Tab("📄 Rendered"):
|
| 215 |
+
output_text = gr.Markdown(
|
| 216 |
+
value="*Extracted text will appear here...*",
|
| 217 |
+
height=600
|
| 218 |
+
)
|
| 219 |
+
with gr.Tab("📝 Raw Text"):
|
| 220 |
+
raw_output = gr.Textbox(
|
| 221 |
+
placeholder="Raw text will appear here...",
|
| 222 |
+
lines=25,
|
| 223 |
+
show_copy_button=True
|
| 224 |
+
)
|
| 225 |
|
| 226 |
+
# Event handlers
|
| 227 |
submit_btn.click(
|
| 228 |
fn=process_input,
|
| 229 |
+
inputs=[file_input, backend_selector, scale_slider, temperature, max_tokens_slider, num_pages],
|
| 230 |
+
outputs=[status_display, output_text, raw_output, page_info, rendered_image]
|
| 231 |
)
|
| 232 |
|
| 233 |
clear_btn.click(
|
| 234 |
+
fn=lambda: ("Idle", None, "*Extracted text will appear here...*", "", "", None),
|
| 235 |
+
outputs=[status_display, file_input, output_text, raw_output, page_info, rendered_image]
|
| 236 |
)
|
| 237 |
|
| 238 |
|
| 239 |
if __name__ == "__main__":
|
| 240 |
+
demo.launch()
|
backends/__init__.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Backend interface for LightOnOCR-1B inference.
|
| 3 |
+
Supports both PyTorch and GGUF backends.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from typing import List, Tuple
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class OCRBackend(ABC):
    """Abstract base class for OCR backends."""

    @abstractmethod
    def load_model(self):
        """Load the OCR model."""
        pass

    @abstractmethod
    def process_image(self, image: Image.Image, temperature: float = 0.1,
                      max_tokens: int = 1024) -> str:
        """
        Process a single image and return extracted text.

        Args:
            image: PIL Image to process
            temperature: Sampling temperature (0 = greedy)
            max_tokens: Maximum number of tokens to generate
                (fix: both concrete backends implement this parameter and
                callers pass it; it was missing from the abstract contract)

        Returns:
            Extracted text as string
        """
        pass

    @abstractmethod
    def get_backend_info(self) -> dict:
        """Return backend information (name, device, memory usage, etc.)."""
        pass
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_available_backends() -> List[str]:
    """Return the names of OCR backends usable in this environment.

    "pytorch" is always offered.  "gguf" is added when a locally built
    ``llama-mtmd-cli`` binary exists under the project's llama.cpp build
    tree, or — as a fallback — when the ``llama_cpp`` python package is
    importable.
    """
    from pathlib import Path

    available = ["pytorch"]

    # Preferred path: the locally compiled CLI binary.
    root = Path(__file__).parent.parent
    binary = root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"

    if binary.exists():
        available.append("gguf")
    else:
        # Fallback: the python bindings, although the CLI is preferred.
        try:
            import llama_cpp  # noqa: F401
            available.append("gguf")
        except ImportError:
            pass

    return available
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def create_backend(backend_name: str) -> OCRBackend:
    """
    Factory function to create backend instance.

    Args:
        backend_name: "pytorch" or "gguf"

    Returns:
        OCRBackend instance

    Raises:
        ValueError: if ``backend_name`` is not a recognized backend.
    """
    # Imports are deferred so that selecting one backend never pulls in
    # the other's (heavy) dependencies.
    if backend_name == "gguf":
        from .gguf_backend import GGUFBackend
        return GGUFBackend()
    if backend_name == "pytorch":
        from .pytorch_backend import PyTorchBackend
        return PyTorchBackend()
    raise ValueError(f"Unknown backend: {backend_name}. Available: {get_available_backends()}")
|
backends/gguf_backend.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GGUF backend for LightOnOCR-1B using local llama-mtmd-cli binary.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import io
|
| 7 |
+
import tempfile
|
| 8 |
+
import subprocess
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
from . import OCRBackend
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GGUFBackend(OCRBackend):
    """GGUF-based OCR backend using local llama-mtmd-cli binary.

    Inference is delegated to an external ``llama-mtmd-cli`` subprocess;
    no model weights are loaded into this python process.
    """

    def __init__(self, model_path: Optional[str] = None, mmproj_path: Optional[str] = None):
        """
        Initialize GGUF backend.

        Args:
            model_path: Path to GGUF model file
            mmproj_path: Path to mmproj file

        Missing paths are auto-detected from the filesystem (see
        _auto_detect_files); nothing is validated until load_model().
        """
        self.model_path = model_path
        self.mmproj_path = mmproj_path
        self.cli_path = self._find_cli_binary()
        self._auto_detect_files()

    def _find_cli_binary(self) -> Optional[str]:
        """Find the llama-mtmd-cli binary.

        Returns:
            Path string when the project-local llama.cpp build contains
            the binary, else None.
        """
        # Check project root llama.cpp build
        project_root = Path(__file__).parent.parent
        cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
        if cli_path.exists():
            return str(cli_path)
        return None

    def _auto_detect_files(self):
        """Try to find GGUF model and mmproj files.

        If a valid model_path was supplied, only the companion mmproj file
        (same directory, name containing "mmproj") is searched for.
        Otherwise both files are searched for under ./models and
        ./gguf_models relative to the current working directory, in that
        order; the first match wins.
        """
        if self.model_path and Path(self.model_path).exists():
            if not self.mmproj_path:
                model_dir = Path(self.model_path).parent
                for mmproj_file in model_dir.glob("*mmproj*.gguf"):
                    self.mmproj_path = str(mmproj_file)
                    print(f"Auto-detected mmproj: {self.mmproj_path}")
                    break
            return

        search_paths = [
            Path.cwd() / "models",
            Path.cwd() / "gguf_models",
        ]

        for search_path in search_paths:
            if not search_path.exists():
                continue
            for gguf_file in search_path.rglob("*.gguf"):
                # Model files contain "lightonocr" but not "mmproj";
                # projector files contain "mmproj".
                if "lightonocr" in gguf_file.name.lower() and "mmproj" not in gguf_file.name.lower():
                    self.model_path = str(gguf_file)
                    print(f"Auto-detected model: {self.model_path}")
                    model_dir = gguf_file.parent
                    for mmproj_file in model_dir.glob("*mmproj*.gguf"):
                        self.mmproj_path = str(mmproj_file)
                        print(f"Auto-detected mmproj: {self.mmproj_path}")
                        break
                    break
            if self.model_path:
                break

    def load_model(self):
        """Verify model, mmproj and CLI binary exist.

        No weights are loaded here — the CLI subprocess loads them per
        invocation in process_image().

        Raises:
            RuntimeError: when the llama-mtmd-cli binary is missing.
            ValueError: when the model or mmproj file cannot be found.
        """
        if not self.cli_path:
            raise RuntimeError(
                "llama-mtmd-cli binary not found.\n"
                "Please build llama.cpp locally:\n"
                "  git clone https://github.com/ggerganov/llama.cpp\n"
                "  cd llama.cpp && mkdir build && cd build\n"
                "  cmake .. -DGGML_METAL=ON && cmake --build . --config Release"
            )

        if not self.model_path or not Path(self.model_path).exists():
            raise ValueError("GGUF model not found. Run download_gguf_model.py")

        if not self.mmproj_path or not Path(self.mmproj_path).exists():
            raise ValueError("mmproj file not found. Run download_gguf_model.py")

        print(f"GGUF Backend ready:")
        print(f"  CLI: {self.cli_path}")
        print(f"  Model: {self.model_path}")
        print(f"  Projector: {self.mmproj_path}")

    def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
        """Process image using llama-mtmd-cli.

        The image is written to a temporary PNG (the subprocess needs a
        path, not a pipe), the CLI is invoked synchronously, and its
        stdout is returned as the extracted text.

        Args:
            image: PIL image to OCR.
            temperature: Sampling temperature, forwarded via --temp.
            max_tokens: Generation cap, forwarded via --n-predict.

        Returns:
            Extracted text (stripped stdout of the CLI).

        Raises:
            RuntimeError: when the CLI exits non-zero (or, via
                load_model, when required files are missing).
        """
        if not self.cli_path:
            # load_model() raises with an actionable message here.
            self.load_model()

        # Save image to temp file; delete=False so the subprocess can open
        # it by path — removed in the finally block below.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
            image.save(tmp_img.name)
            tmp_img_path = tmp_img.name

        try:
            cmd = [
                self.cli_path,
                "-m", self.model_path,
                "--mmproj", self.mmproj_path,
                "--image", tmp_img_path,
                "-p", "Extract all text from this image. Be precise and include all visible text.",
                "--temp", str(temperature),
                "--n-predict", str(max_tokens),
                # "--log-disable" # Removed as it suppresses output
            ]

            # Run CLI
            # NOTE(review): no timeout is set — a hung CLI blocks forever;
            # consider subprocess.run(..., timeout=...) if this matters.
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode != 0:
                print(f"CLI Error: {result.stderr}")
                raise RuntimeError(f"llama-mtmd-cli failed: {result.stderr}")

            # stdout contains the generated text, stderr contains logs
            return result.stdout.strip()

        finally:
            if os.path.exists(tmp_img_path):
                os.unlink(tmp_img_path)

    def get_backend_info(self) -> dict:
        """Return backend information (name, device, file paths)."""
        return {
            "name": "GGUF (llama-mtmd-cli)",
            "device": "Metal (via CLI)",
            "model_path": self.model_path or "not found",
            "mmproj_path": self.mmproj_path or "not found",
            "cli_path": self.cli_path
        }
|
backends/pytorch_backend.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PyTorch backend for LightOnOCR-1B.
|
| 3 |
+
Uses Mistral3ForConditionalGeneration with custom weight remapping.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import platform
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from transformers import AutoConfig, PixtralProcessor, Mistral3ForConditionalGeneration
|
| 11 |
+
from safetensors.torch import load_file
|
| 12 |
+
from huggingface_hub import hf_hub_download
|
| 13 |
+
|
| 14 |
+
from . import OCRBackend
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class PyTorchBackend(OCRBackend):
|
| 18 |
+
"""PyTorch-based OCR backend using transformers."""
|
| 19 |
+
|
| 20 |
+
def __init__(self):
    """Create an unloaded backend; load_model() populates all state."""
    # Inference state, filled in lazily by load_model().
    self.model, self.processor = None, None
    self.device, self.dtype = None, None
    # Hugging Face repository id for the LightOnOCR-1B checkpoint.
    self.model_id = "lightonai/LightOnOCR-1B-1025"
|
| 26 |
+
|
| 27 |
+
def load_model(self):
    """Load the PyTorch model with custom weight remapping.

    The published checkpoint uses "vision_encoder"/"vision_projection"
    key prefixes, while Mistral3ForConditionalGeneration expects
    "vision_tower"/"multi_modal_projector", so the state dict keys are
    renamed before loading.  Idempotent: returns immediately when the
    model is already loaded.  Picks CUDA > MPS > CPU and a matching dtype.
    """
    if self.model is not None:
        return  # Already loaded

    print(f"Loading {self.model_id} (PyTorch backend)...")

    # Load processor
    self.processor = PixtralProcessor.from_pretrained(self.model_id, trust_remote_code=True)

    # Instantiate model with config (weights attached manually below)
    config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
    self.model = Mistral3ForConditionalGeneration(config)

    # Download and remap weights
    print(" Downloading and remapping weights...")
    weights_path = hf_hub_download(repo_id=self.model_id, filename="model.safetensors")
    state_dict = load_file(weights_path)

    new_state_dict = {}
    for k, v in state_dict.items():
        new_key = k
        if "vision_encoder" in k:
            new_key = k.replace("vision_encoder", "vision_tower")
        if "vision_projection" in k:
            new_key = k.replace("vision_projection", "multi_modal_projector")
        new_state_dict[new_key] = v

    # NOTE(review): strict=False silently ignores missing/unexpected keys;
    # a remap typo would not be reported here.
    self.model.load_state_dict(new_state_dict, strict=False)

    # Determine device: prefer CUDA, override to MPS on Apple Silicon
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    if platform.system() == "Darwin" and "arm" in platform.machine().lower():
        self.device = "mps"

    # MPS has issues with float16, use float32
    if self.device == "mps":
        self.dtype = torch.float32
    else:
        self.dtype = torch.float16 if self.device == "cuda" else torch.float32

    self.model = self.model.to(device=self.device, dtype=self.dtype)
    self.model.eval()

    print(f" Model loaded on {self.device} ({self.dtype})")
|
| 72 |
+
|
| 73 |
+
def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
    """Run OCR on a single PIL image using the PyTorch model.

    Args:
        image: Page image to transcribe.
        temperature: Sampling temperature; 0 selects greedy decoding.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The extracted text with surrounding whitespace stripped.
    """
    if self.model is None:
        self.load_model()  # lazy-load on first use

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Extract all text from this image. Be precise and include all visible text."}
            ]
        }
    ]

    prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = self.processor(text=prompt, images=image, return_tensors="pt")
    inputs = {k: v.to(self.device) for k, v in inputs.items()}

    # Ensure pixel_values match model dtype (critical for MPS)
    if 'pixel_values' in inputs:
        inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)

    # Only forward sampling parameters when actually sampling: passing
    # temperature=0 together with do_sample=False makes transformers warn,
    # and newer versions reject temperature <= 0 outright.
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "pad_token_id": self.processor.tokenizer.eos_token_id,
    }
    if temperature > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature
    else:
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        generated_ids = self.model.generate(**inputs, **gen_kwargs)

    # Strip the prompt tokens so only the newly generated text is decoded
    input_len = inputs['input_ids'].shape[1] if 'input_ids' in inputs else 0
    new_tokens = generated_ids[:, input_len:] if generated_ids.shape[1] > input_len else generated_ids
    generated_text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

    return generated_text.strip()
|
| 110 |
+
|
| 111 |
+
def get_backend_info(self) -> dict:
    """Describe the current state of the PyTorch backend as a plain dict."""
    device_desc = str(self.device) if self.device else "not loaded"
    dtype_desc = str(self.dtype) if self.dtype else "not loaded"
    return {
        "name": "PyTorch",
        "device": device_desc,
        "dtype": dtype_desc,
        "model_id": self.model_id,
        "loaded": self.model is not None,
    }
|
docs/gguf_setup.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GGUF Backend Setup Guide
|
| 2 |
+
|
| 3 |
+
## Quick Start (Recommended)
|
| 4 |
+
|
| 5 |
+
Since `llama-cpp-python` doesn't yet support LightOnOCR, we must build `llama.cpp` locally.
|
| 6 |
+
|
| 7 |
+
### 1. Build llama.cpp locally
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
# Clone repository
|
| 11 |
+
git clone https://github.com/ggerganov/llama.cpp
|
| 12 |
+
cd llama.cpp
|
| 13 |
+
|
| 14 |
+
# Create build directory
|
| 15 |
+
mkdir build && cd build
|
| 16 |
+
|
| 17 |
+
# Build with Metal support (macOS)
|
| 18 |
+
cmake .. -DGGML_METAL=ON
|
| 19 |
+
cmake --build . --config Release -j 8
|
| 20 |
+
|
| 21 |
+
# Verify build
|
| 22 |
+
./bin/llama-mtmd-cli --help
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
### 2. Download GGUF Model
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
# Return to project root
|
| 29 |
+
cd ../../
|
| 30 |
+
|
| 31 |
+
# Run download script
|
| 32 |
+
python download_gguf_model.py
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
### 3. Use GGUF Backend
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
# CLI
|
| 39 |
+
python ocr_cli.py document.pdf --backend gguf
|
| 40 |
+
|
| 41 |
+
# Gradio UI
|
| 42 |
+
python app.py
|
| 43 |
+
# Select "gguf" from backend dropdown
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Performance
|
| 47 |
+
|
| 48 |
+
The custom built `llama-mtmd-cli` provides incredible performance on Apple Silicon:
|
| 49 |
+
|
| 50 |
+
| Backend | Time per Page | Speedup |
|
| 51 |
+
|---------|---------------|---------|
|
| 52 |
+
| PyTorch (Original) | ~4 mins | 1x |
|
| 53 |
+
| PyTorch (Optimized) | ~40 sec | 6x |
|
| 54 |
+
| **GGUF (llama-mtmd-cli)** | **~3 sec** | **80x** ⭐ |
|
| 55 |
+
|
| 56 |
+
## Troubleshooting
|
| 57 |
+
|
| 58 |
+
### "llama-mtmd-cli binary not found"
|
| 59 |
+
Ensure you successfully built `llama.cpp` and the binary exists at `llama.cpp/build/bin/llama-mtmd-cli`.
|
| 60 |
+
|
| 61 |
+
### "GGUF model not found"
|
| 62 |
+
Run `python download_gguf_model.py` to download the required model files.
|
docs/gguf_status.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GGUF Backend Status
|
| 2 |
+
|
| 3 |
+
## Current Status: ⚠️ Not Yet Supported
|
| 4 |
+
|
| 5 |
+
The GGUF backend infrastructure is **fully implemented and ready**, but cannot be used yet due to a limitation in llama.cpp.
|
| 6 |
+
|
| 7 |
+
### Issue
|
| 8 |
+
|
| 9 |
+
LightOnOCR-1B uses a custom multimodal projector type (`lightonocr`) that is not yet supported in the standard llama.cpp library:
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
clip_init: failed to load model: load_hparams: unknown projector type: lightonocr
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
### What's Ready
|
| 16 |
+
|
| 17 |
+
✅ llama-cpp-python installed with Metal support
|
| 18 |
+
✅ GGUF Q8_0 model downloaded (767MB)
|
| 19 |
+
✅ mmproj file downloaded (417MB)
|
| 20 |
+
✅ Complete backend implementation (`backends/gguf_backend.py`)
|
| 21 |
+
✅ CLI and UI integration
|
| 22 |
+
|
| 23 |
+
### What's Needed
|
| 24 |
+
|
| 25 |
+
❌ llama.cpp support for LightOnOCR projector type
|
| 26 |
+
|
| 27 |
+
### Workaround Options
|
| 28 |
+
|
| 29 |
+
1. **Wait for official support** - Monitor llama.cpp repository
|
| 30 |
+
2. **Use PyTorch backend** - Fully functional, ~40s per page
|
| 31 |
+
3. **Contribute to llama.cpp** - Add LightOnOCR projector support
|
| 32 |
+
|
| 33 |
+
### When Will GGUF Work?
|
| 34 |
+
|
| 35 |
+
The GGUF backend will work automatically once llama.cpp adds support for the `lightonocr` projector type. No code changes will be needed in this project - just update llama-cpp-python:
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
pip install --upgrade llama-cpp-python
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
### Alternative: Use PyTorch
|
| 42 |
+
|
| 43 |
+
The PyTorch backend is fully optimized and works well:
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
# Recommended settings
|
| 47 |
+
python ocr_cli.py document.pdf --scale 1.0 --max-tokens 1024
|
| 48 |
+
|
| 49 |
+
# Result: ~40 seconds per page
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## References
|
| 53 |
+
|
| 54 |
+
- [llama.cpp GitHub](https://github.com/ggerganov/llama.cpp)
|
| 55 |
+
- [LightOnOCR GGUF Models](https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF)
|
| 56 |
+
- [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
| 57 |
+
|
| 58 |
+
## Monitoring
|
| 59 |
+
|
| 60 |
+
Check these for updates:
|
| 61 |
+
- llama.cpp issues/PRs mentioning LightOnOCR
|
| 62 |
+
- llama-cpp-python release notes
|
| 63 |
+
- LightOnOCR Hugging Face discussions
|
docs/performance_optimization.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Оптимізація швидкодії LightOnOCR-1B на M3 Max
|
| 2 |
+
|
| 3 |
+
## Поточна ситуація
|
| 4 |
+
- **PyTorch на MPS**: ~4 хвилини на сторінку (дуже повільно)
|
| 5 |
+
- **Причина**: MPS backend значно повільніший за CUDA для трансформерів
|
| 6 |
+
|
| 7 |
+
## Швидкі оптимізації (PyTorch)
|
| 8 |
+
|
| 9 |
+
### 1. Зменшення max_tokens
|
| 10 |
+
```python
|
| 11 |
+
# У backends/pytorch_backend.py, рядок ~95
|
| 12 |
+
generated_ids = self.model.generate(
|
| 13 |
+
**inputs,
|
| 14 |
+
max_new_tokens=1024, # Було 2048, зменшити до 512-1024
|
| 15 |
+
temperature=temperature,
|
| 16 |
+
do_sample=temperature > 0,
|
| 17 |
+
pad_token_id=self.processor.tokenizer.eos_token_id
|
| 18 |
+
)
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
### 2. Використання нижчої роздільної здатності
|
| 22 |
+
```bash
|
| 23 |
+
# Замість scale=1.5, використовуйте scale=1.0
|
| 24 |
+
python ocr_cli.py document.pdf --scale 1.0
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Рекомендоване рішення: GGUF + llama.cpp
|
| 28 |
+
|
| 29 |
+
### Чому GGUF швидший?
|
| 30 |
+
- Оптимізований для Apple Silicon (Metal)
|
| 31 |
+
- Квантизація (Q8_0) зменшує розмір і прискорює
|
| 32 |
+
- Спеціалізований inference engine
|
| 33 |
+
|
| 34 |
+
### Встановлення
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
# 1. Встановити llama-cpp-python з Metal support
|
| 38 |
+
CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python
|
| 39 |
+
|
| 40 |
+
# 2. Завантажити GGUF модель
|
| 41 |
+
git lfs install
|
| 42 |
+
git clone https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF
|
| 43 |
+
# Або
|
| 44 |
+
git clone https://huggingface.co/Mungert/LightOnOCR-1B-1025-GGUF
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### Використання
|
| 48 |
+
```bash
|
| 49 |
+
# Після завантаження моделі
|
| 50 |
+
python ocr_cli.py document.pdf --backend gguf --model-path path/to/model.gguf
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Очікувані результати
|
| 54 |
+
- **PyTorch оптимізований**: ~2-3 хвилини на сторінку
|
| 55 |
+
- **GGUF Q8_0**: ~30-60 секунд на сторінку (орієнтовно)
|
| 56 |
+
|
| 57 |
+
## Примітка
|
| 58 |
+
GGUF backend у поточній версії є заглушкою. Для повної підтримки потрібно:
|
| 59 |
+
1. Завантажити GGUF модель
|
| 60 |
+
2. Реалізувати vision model support у `backends/gguf_backend.py`
|
download_gguf_model.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Download GGUF model and mmproj files for LightOnOCR-1B.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
def download_gguf_model():
    """Download the LightOnOCR-1B GGUF model repository via git-lfs.

    Clones (or updates) the ggml-org GGUF repository into
    ``models/lightonocr-gguf`` and prints a summary of the downloaded
    ``*.gguf`` files. Exits with status 1 if git or git-lfs is missing.
    """
    import subprocess

    models_dir = Path("models/lightonocr-gguf")
    models_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("LightOnOCR-1B GGUF Model Download")
    print("=" * 60)
    print()
    print("This will download ~1-2GB of model files.")
    print(f"Target directory: {models_dir.absolute()}")
    print()

    # git-lfs is required: the .gguf files are stored as LFS objects
    try:
        result = subprocess.run(["git", "lfs", "version"], capture_output=True, text=True)
        if result.returncode != 0:
            print("ERROR: git-lfs not installed!")
            print()
            print("Install git-lfs first:")
            print("  macOS: brew install git-lfs")
            print("  Then run: git lfs install")
            sys.exit(1)
    except FileNotFoundError:
        print("ERROR: git not found!")
        sys.exit(1)

    # Clone or update the model repository
    repo_url = "https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF"

    if (models_dir / ".git").exists():
        print("Model directory already exists. Updating...")
        # Use cwd= instead of os.chdir(): chdir-ing into models_dir would
        # invalidate the relative models_dir path used by the glob below
        # and permanently change the process working directory.
        subprocess.run(["git", "pull"], cwd=models_dir, check=True)
    else:
        print(f"Cloning from {repo_url}...")
        subprocess.run([
            "git", "clone",
            repo_url,
            str(models_dir)
        ], check=True)

    print()
    print("✓ Download complete!")
    print()
    print("Downloaded files:")
    for gguf_file in models_dir.glob("*.gguf"):
        size_mb = gguf_file.stat().st_size / (1024 * 1024)
        print(f"  - {gguf_file.name} ({size_mb:.1f} MB)")

    print()
    print("Recommended files for use:")
    print("  Model: LightOnOCR-1B-1025-Q8_0.gguf (best quality)")
    print("  mmproj: mmproj-Q8_0.gguf or mmproj-f16.gguf")
    print()
    print("Usage:")
    print("  python ocr_cli.py document.pdf --backend gguf")
    print()


if __name__ == "__main__":
    download_gguf_model()
|
download_model.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from huggingface_hub import HfApi, snapshot_download
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the Hugging Face model downloader."""
    parser = argparse.ArgumentParser(
        description="Download a model repository from Hugging Face Hub."
    )
    parser.add_argument(
        "model_id",
        nargs="?",
        default="lightonai/LightOnOCR-1B-1025",
        help="Model repository to download (default: %(default)s)",
    )
    # (flag, extra kwargs, help text) triples for the optional string args;
    # all default to None so snapshot_download falls back to its own defaults.
    optional_specs = [
        ("--revision", {},
         "Specific git revision (branch/tag/commit) to download."),
        ("--cache-dir", {},
         "Cache directory where the model snapshot will be stored."),
        ("--local-dir", {},
         "Optional local directory to copy the snapshot into after download."),
        ("--token", {},
         "Hugging Face access token; defaults to HF_TOKEN or HUGGINGFACEHUB_API_TOKEN env vars."),
        ("--allow-pattern", {"action": "append"},
         "File glob pattern(s) to include when downloading."),
        ("--ignore-pattern", {"action": "append"},
         "File glob pattern(s) to exclude when downloading."),
    ]
    for flag, extra, help_text in optional_specs:
        parser.add_argument(flag, default=None, help=help_text, **extra)
    parser.add_argument(
        "--offline",
        action="store_true",
        help="Run in offline mode, using only the local cache.",
    )
    return parser.parse_args()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def resolve_token(user_token: str | None) -> str | None:
|
| 61 |
+
if user_token:
|
| 62 |
+
return user_token
|
| 63 |
+
return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def ensure_auth(token: str | None) -> None:
|
| 67 |
+
if token:
|
| 68 |
+
return
|
| 69 |
+
try:
|
| 70 |
+
api = HfApi()
|
| 71 |
+
if api.whoami():
|
| 72 |
+
return
|
| 73 |
+
except Exception:
|
| 74 |
+
pass
|
| 75 |
+
|
| 76 |
+
raise RuntimeError(
|
| 77 |
+
"Hugging Face token not provided. Set HF_TOKEN or run `huggingface-cli login`."
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def main() -> None:
    """CLI entry point: authenticate, fetch the snapshot, print its path."""
    args = parse_args()
    token = resolve_token(args.token)

    # Offline runs only read the local cache, so no auth check is needed.
    if not args.offline:
        ensure_auth(token)

    download_kwargs = dict(
        repo_id=args.model_id,
        revision=args.revision,
        cache_dir=args.cache_dir,
        local_dir=args.local_dir,
        allow_patterns=args.allow_pattern,
        ignore_patterns=args.ignore_pattern,
        token=token,
        local_files_only=args.offline,
    )
    try:
        snapshot_path = snapshot_download(**download_kwargs)
    except Exception as exc:
        print(f"Failed to download {args.model_id}: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"Model snapshot available at: {Path(snapshot_path).resolve()}")


if __name__ == "__main__":
    main()
|
llama.cpp
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit bd2a93d4753c4f00443f561ee039220283016ee8
|
ocr_cli.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OCR CLI utility for LightOnOCR-1B with backend support.
|
| 4 |
+
Supports PyTorch and GGUF backends for flexible performance/quality trade-offs.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import argparse
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import pypdfium2 as pdfium
|
| 14 |
+
|
| 15 |
+
# Add project root to path
|
| 16 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 17 |
+
|
| 18 |
+
from backends import create_backend, get_available_backends
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def render_pdf_page(page, scale=2.0):
    """Rasterize a single pypdfium2 page into a PIL image at the given scale."""
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024):
    """
    Process PDF or image file with OCR and write the result as <input>.md.

    Args:
        input_path: Path to input file (PDF or image)
        backend_name: "pytorch" or "gguf"
        scale: PDF rendering scale (lower = faster, higher = better quality)
        temperature: Sampling temperature for generation
        max_tokens: Maximum tokens to generate (lower = faster)
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.")
        return

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()

    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images
    images = []
    if input_path.suffix.lower() == '.pdf':
        print(f"\nProcessing PDF: {input_path.name}")
        pdf = pdfium.PdfDocument(str(input_path))
        try:
            num_pages = len(pdf)
            print(f" Total pages: {num_pages}")
            print(f" Rendering scale: {scale}x")

            for i in range(num_pages):
                print(f" Rendering page {i+1}/{num_pages}...", end=" ")
                start = time.time()
                images.append(render_pdf_page(pdf[i], scale=scale))
                print(f"({time.time() - start:.1f}s)")
        finally:
            # Always release the document handle, even if rendering fails
            pdf.close()
    else:
        print(f"Processing image: {input_path.name}")
        images = [Image.open(input_path)]

    # Guard: a zero-page PDF would otherwise divide by zero below
    if not images:
        print("No pages to process.")
        return

    # Process with OCR
    all_texts = []
    total_start = time.time()

    for i, img in enumerate(images):
        print(f"\n OCR on page {i+1}/{len(images)}...", end=" ")
        start = time.time()

        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start

            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f" Preview: {text[:80]}...")
        except Exception as e:
            # Keep going on per-page failures; record the error inline
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')

    total_time = time.time() - total_start
    print("\n✓ OCR Complete!")
    print(f" Total time: {total_time:.1f}s ({total_time/len(images):.1f}s per page)")
    print(f" Output: {output_path}")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def main():
    """Parse command-line arguments, validate them, and dispatch OCR."""
    parser = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Process with PyTorch (default, best quality)
python ocr_cli.py document.pdf

# Process with GGUF (faster, requires llama-cpp-python)
python ocr_cli.py document.pdf --backend gguf

# Fast processing with lower resolution
python ocr_cli.py document.pdf --scale 1.5

# High quality with higher resolution
python ocr_cli.py document.pdf --scale 3.0
"""
    )

    parser.add_argument("input_file", nargs="?",
                        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
                        help="Input PDF or image file (default: test PDF)")
    parser.add_argument("--backend", choices=get_available_backends(),
                        default="pytorch",
                        help="Backend to use for inference (default: pytorch)")
    parser.add_argument("--scale", type=float, default=2.0,
                        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)")
    parser.add_argument("--temperature", type=float, default=0.1,
                        help="Sampling temperature (default: 0.1, 0=greedy)")
    parser.add_argument("--max-tokens", type=int, default=1024,
                        help="Maximum tokens to generate (default: 1024, range: 256-2048)")

    args = parser.parse_args()

    # Out-of-range scales are allowed, but warn the user.
    if not 1.0 <= args.scale <= 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
    except Exception as e:
        print(f"\nFatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
|
@@ -1 +1,11 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.42.0
|
| 2 |
+
pillow>=10.3.0,<11
|
| 3 |
+
pypdfium2==4.30.0
|
| 4 |
+
requests>=2.31.0,<3
|
| 5 |
+
huggingface_hub>=0.24.0
|
| 6 |
+
torch>=2.0.0
|
| 7 |
+
transformers>=4.36.0
|
| 8 |
+
accelerate>=0.26.0
|
| 9 |
+
safetensors>=0.4.0
|
| 10 |
+
# llama-cpp-python is optional for GGUF backend support (or use local build)
|
| 11 |
+
# llama-cpp-python>=0.3.0
|