DocUA committed on
Commit
eb133b8
·
1 Parent(s): eed9900

feat: update ggml kernels, webui components, model templates, and build configurations

Browse files
.gitignore ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python builds
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+
6
+ # Virtual environments
7
+ .venv/
8
+ venv/
9
+
10
+ # Editor settings
11
+ .DS_Store
12
+
13
+ # Environment files
14
+ .env
15
+ .env.local
16
+
17
+ # Model caches
18
+ models/
19
+ *.safetensors
20
+
21
+ # Test docs
22
+ test_docs/
23
+ help_docs/
24
+
README.md CHANGED
@@ -1,16 +1,61 @@
1
- ---
2
- title: LightOnOCR 1B Demo
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- license: apache-2.0
14
- ---
15
-
16
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LightOnOCR-1B Demo
2
+
3
+ High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon.
4
+
5
+ ## 🚀 Performance
6
+ - **GGUF Backend:** ~3-4 seconds per page (M3 Max)!
7
+ - **PyTorch Backend:** ~40 seconds per page
8
+
9
+ ## Features
10
+ - 📄 PDF and image support
11
+ - 🔄 Seamless switching between GGUF and PyTorch backends
12
+ - 🎛️ Configurable resolution (scale) and token generation
13
+ - 🖥️ CLI and Gradio web interface
14
+ - 🍎 Full Metal/MPS support
15
+
16
+ ## Quick Start
17
+
18
+ ### 1. Prerequisites
19
+ - Python 3.10+
20
+ - `cmake` and `git`
21
+
22
+ ```bash
23
+ pip install -r requirements.txt
24
+ pip install accelerate
25
+ ```
26
+
27
+ ### 2. Setup GGUF (Highly Recommended)
28
+ See [GGUF Setup Guide](docs/gguf_setup.md).
29
+
30
+ 1. Build `llama.cpp` locally:
31
+ ```bash
32
+ git clone https://github.com/ggerganov/llama.cpp
33
+ cd llama.cpp && mkdir build && cd build
34
+ cmake .. -DGGML_METAL=ON && cmake --build . --config Release -j 8
35
+ cd ../..
36
+ ```
37
+ 2. Download model:
38
+ ```bash
39
+ python download_gguf_model.py
40
+ ```
41
+
42
+ ### 3. Usage
43
+
44
+ **Command Line:**
45
+ ```bash
46
+ # Fastest
47
+ python ocr_cli.py document.pdf --backend gguf
48
+
49
+ # High Quality
50
+ python ocr_cli.py document.pdf --backend gguf --scale 2.0
51
+ ```
52
+
53
+ **Web Interface:**
54
+ ```bash
55
+ python app.py
56
+ ```
57
+ Open http://127.0.0.1:7860 and select **GGUF** backend.
58
+
59
+ ## Documentation
60
+ - [GGUF Setup Guide](docs/gguf_setup.md)
61
+ - [Performance Optimization](docs/performance_optimization.md)
app.py CHANGED
@@ -1,76 +1,92 @@
1
  #!/usr/bin/env python3
 
 
 
 
2
  import os
3
- import json
4
- import base64
5
- import requests
6
  import gradio as gr
 
7
  from PIL import Image
8
- from io import BytesIO
9
  import pypdfium2 as pdfium
10
 
11
- ENDPOINT = os.environ.get("VLLM_ENDPOINT")
12
- MODEL = os.environ.get("VLLM_MODEL")
 
13
 
14
- if not ENDPOINT or not MODEL:
15
- raise ValueError("VLLM_ENDPOINT and VLLM_MODEL environment variables must be set.")
 
16
 
17
 
18
- def image_to_base64(image):
19
- buffered = BytesIO()
20
- if image.mode == 'RGBA':
21
- image = image.convert('RGB')
22
- image.save(buffered, format="PNG")
23
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
 
 
 
24
 
25
 
26
- def render_pdf_page(page, max_resolution=1280, scale=2.77):
27
- width, height = page.get_size()
28
- pixel_width = width * scale
29
- pixel_height = height * scale
30
- resize_factor = min(max_resolution / pixel_width, max_resolution / pixel_height)
31
- target_scale = scale * resize_factor
32
- return page.render(scale=target_scale, rev_byteorder=True).to_pil()
33
 
34
 
35
- def process_pdf(pdf_path, num_pages=1):
 
36
  pdf = pdfium.PdfDocument(pdf_path)
37
  total_pages = len(pdf)
38
- pages_to_process = min(num_pages, total_pages, 5)
39
  images = []
40
 
41
  for i in range(pages_to_process):
42
  page = pdf[i]
43
- img = render_pdf_page(page)
44
  images.append(img)
45
 
46
  pdf.close()
47
  return images, total_pages
48
 
49
 
50
- def process_input(file_input, temperature, num_pages):
 
51
  if file_input is None:
52
- yield "Please upload an image or PDF first.", "", "", None
53
  return
54
-
 
 
 
 
 
 
 
 
55
  images_to_process = []
56
  page_info = ""
57
  display_image = None
58
-
59
- file_path = file_input if isinstance(file_input, str) else file_input.name
60
-
61
- if file_path.lower().endswith('.pdf'):
 
 
 
 
62
  try:
63
- images_to_process, total_pages = process_pdf(file_path, num_pages)
64
  if len(images_to_process) == 0:
65
- yield "Error: Could not extract pages from PDF.", "", "", None
66
  return
67
  display_image = images_to_process[0]
68
- if len(images_to_process) == 1:
69
- page_info = f"Processing page 1 of {total_pages}"
70
- else:
71
- page_info = f"Processing {len(images_to_process)} pages of {total_pages}"
72
  except Exception as e:
73
- yield f"Error processing PDF: {str(e)}", "", "", None
74
  return
75
  else:
76
  try:
@@ -79,84 +95,44 @@ def process_input(file_input, temperature, num_pages):
79
  display_image = img
80
  page_info = "Processing image"
81
  except Exception as e:
82
- yield f"Error opening image: {str(e)}", "", "", None
83
  return
84
-
85
- content = [{"type": "text", "text": ""}]
86
-
87
- for img in images_to_process:
88
- try:
89
- b64_image = image_to_base64(img)
90
- content.append({
91
- "type": "image_url",
92
- "image_url": {"url": f"data:image/png;base64,{b64_image}"}
93
- })
94
- except Exception as e:
95
- yield f"Error encoding image: {str(e)}", "", "", display_image
96
- return
97
-
98
- payload = {
99
- "model": MODEL,
100
- "messages": [
101
- {
102
- "role": "user",
103
- "content": content
104
- }
105
- ],
106
- "temperature": temperature,
107
- "stream": True
108
- }
109
 
 
110
  try:
111
- response = requests.post(
112
- ENDPOINT,
113
- headers={"Content-Type": "application/json"},
114
- data=json.dumps(payload),
115
- stream=True
116
- )
117
- response.raise_for_status()
118
-
119
- accumulated_response = ""
120
- first_chunk = True
121
-
122
- for line in response.iter_lines():
123
- if line:
124
- line = line.decode('utf-8')
125
- if line.startswith('data: '):
126
- line = line[6:]
127
-
128
- if line.strip() == '[DONE]':
129
- break
130
-
131
- try:
132
- chunk = json.loads(line)
133
- if 'choices' in chunk and len(chunk['choices']) > 0:
134
- delta = chunk['choices'][0].get('delta', {})
135
- content_delta = delta.get('content', '')
136
- if content_delta:
137
- accumulated_response += content_delta
138
- if first_chunk:
139
- yield accumulated_response, accumulated_response, page_info, display_image
140
- first_chunk = False
141
- else:
142
- yield accumulated_response, accumulated_response, page_info, gr.update()
143
- except json.JSONDecodeError:
144
- continue
145
-
146
  except Exception as e:
147
- error_msg = f"Error: {str(e)}"
148
- yield error_msg, error_msg, page_info, display_image
149
 
150
 
151
- with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
 
152
  gr.Markdown(
153
  """
154
- # 📖 Image/PDF to Text Extraction
155
- **💡 How to use:**
156
- 1. Upload an image or PDF
157
- 2. For PDFs: choose how many pages to process (1-5, default is 1)
158
- 3. Adjust temperature if needed
159
- 4. Click "Extract Text"
160
  """
161
  )
162
 
@@ -168,62 +144,97 @@ with gr.Blocks(title="📖 Image/PDF OCR", theme=gr.themes.Soft()) as demo:
168
  type="filepath"
169
  )
170
  rendered_image = gr.Image(
171
- label="📄 Preview (First Page)",
172
  type="pil",
173
- height=400,
174
  interactive=False
175
  )
176
- num_pages = gr.Slider(
177
- minimum=1,
178
- maximum=5,
179
- value=1,
180
- step=1,
181
- label="PDF: Number of Pages to Process",
182
- info="Only applies to PDF files (max 5 pages)"
183
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  page_info = gr.Textbox(
185
  label="Processing Info",
186
  value="",
187
  interactive=False
188
  )
189
- temperature = gr.Slider(
190
- minimum=0.1,
191
- maximum=1.0,
192
- value=0.2,
193
- step=0.05,
194
- label="Temperature"
195
- )
196
- submit_btn = gr.Button("Extract Text", variant="primary")
197
- clear_btn = gr.Button("Clear", variant="secondary")
198
 
199
  with gr.Column(scale=2):
200
- output_text = gr.Markdown(
201
- label="📄 Extracted Text (Rendered)",
202
- value="<div style='min-height: 600px; padding: 10px; border: 1px solid #e0e0e0; border-radius: 4px; background-color: #f9f9f9;'><em>Extracted text will appear here...</em></div>",
203
- height=600
204
- )
205
-
206
- with gr.Row():
207
- with gr.Column():
208
- raw_output = gr.Textbox(
209
- label="Raw Markdown Output",
210
- placeholder="Raw text will appear here...",
211
- lines=20,
212
- max_lines=30,
213
- show_copy_button=True
214
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
 
216
  submit_btn.click(
217
  fn=process_input,
218
- inputs=[file_input, temperature, num_pages],
219
- outputs=[output_text, raw_output, page_info, rendered_image]
220
  )
221
 
222
  clear_btn.click(
223
- fn=lambda: (None, "", "", "", None, 1),
224
- outputs=[file_input, output_text, raw_output, page_info, rendered_image, num_pages]
225
  )
226
 
227
 
228
  if __name__ == "__main__":
229
- demo.launch()
 
1
  #!/usr/bin/env python3
2
+ """
3
+ Gradio web interface for LightOnOCR-1B with backend support.
4
+ """
5
+
6
  import os
7
+ import sys
 
 
8
  import gradio as gr
9
+ from pathlib import Path
10
  from PIL import Image
 
11
  import pypdfium2 as pdfium
12
 
13
+ # Add project root to path
14
+ sys.path.insert(0, str(Path(__file__).parent))
15
+ from backends import create_backend, get_available_backends
16
 
17
+ # Global backend
18
+ BACKEND = None
19
+ CURRENT_BACKEND_NAME = "pytorch"
20
 
21
 
22
def load_backend(backend_name="pytorch"):
    """Return the process-wide OCR backend, creating it on first use or on a backend switch.

    The loaded backend instance and its name are cached in the module globals
    BACKEND / CURRENT_BACKEND_NAME so repeated requests for the same backend
    reuse the already-loaded model.
    """
    global BACKEND, CURRENT_BACKEND_NAME

    cache_hit = BACKEND is not None and CURRENT_BACKEND_NAME == backend_name
    if not cache_hit:
        print(f"Loading {backend_name} backend...")
        fresh = create_backend(backend_name)
        fresh.load_model()
        BACKEND = fresh
        CURRENT_BACKEND_NAME = backend_name
        print(f"Backend loaded: {BACKEND.get_backend_info()}")
    return BACKEND
33
 
34
 
35
def render_pdf_page(page, scale=2.0):
    """Rasterize a single pypdfium2 page and return it as a PIL Image.

    Args:
        page: A pypdfium2 page object.
        scale: Render scale factor (higher = larger / sharper output).
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
 
 
 
 
38
 
39
 
40
def process_pdf(pdf_path, num_pages=1, scale=2.0):
    """Extract rendered page images from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        num_pages: Requested page count; capped at the document length and at 10.
        scale: Render scale forwarded to render_pdf_page.

    Returns:
        Tuple of (list of PIL images, total number of pages in the document).
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        total_pages = len(pdf)
        pages_to_process = min(num_pages, total_pages, 10)  # Max 10 pages
        images = [render_pdf_page(pdf[i], scale=scale) for i in range(pages_to_process)]
    finally:
        # Bug fix: close the document handle even when a page fails to render;
        # previously an exception mid-loop leaked the pdfium document.
        pdf.close()
    return images, total_pages
54
 
55
 
56
+ def process_input(file_input, backend_name, scale, temperature, max_tokens, num_pages):
57
+ """Process uploaded file with OCR."""
58
  if file_input is None:
59
+ yield "Idle", "Please upload an image or PDF first.", "", "", None
60
  return
61
+
62
+ # Load backend
63
+ try:
64
+ backend = load_backend(backend_name)
65
+ except Exception as e:
66
+ error_msg = f"Error loading backend: {str(e)}"
67
+ yield "Error", error_msg, error_msg, "", None
68
+ return
69
+
70
  images_to_process = []
71
  page_info = ""
72
  display_image = None
73
+
74
+ file_path = Path(file_input) if isinstance(file_input, str) else Path(file_input.name)
75
+ if not file_path.exists():
76
+ yield "Error", f"File not accessible: {file_path}", "", "", None
77
+ return
78
+
79
+ # Load images
80
+ if file_path.suffix.lower() == '.pdf':
81
  try:
82
+ images_to_process, total_pages = process_pdf(str(file_path), num_pages, scale)
83
  if len(images_to_process) == 0:
84
+ yield "Error", "Could not extract pages from PDF.", "", "", None
85
  return
86
  display_image = images_to_process[0]
87
+ page_info = f"Processing {len(images_to_process)} of {total_pages} pages"
 
 
 
88
  except Exception as e:
89
+ yield "Error", f"Error processing PDF: {str(e)}", "", "", None
90
  return
91
  else:
92
  try:
 
95
  display_image = img
96
  page_info = "Processing image"
97
  except Exception as e:
98
+ yield "Error", f"Error opening image: {str(e)}", "", "", None
99
  return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+ # Process with OCR
102
  try:
103
+ yield "Processing...", "Processing images...", "", page_info, display_image
104
+
105
+ all_texts = []
106
+ for i, img in enumerate(images_to_process):
107
+ try:
108
+ print(f"Processing page {i+1}/{len(images_to_process)}...")
109
+ text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
110
+ all_texts.append(text.strip())
111
+
112
+ # Update progress
113
+ full_text = "\n\n---\n\n".join(all_texts)
114
+ yield "Processing...", full_text, full_text, page_info, display_image
115
+ except Exception as e:
116
+ error_msg = f"Error on page {i+1}: {str(e)}"
117
+ print(f"ERROR: {error_msg}")
118
+ all_texts.append(f"[{error_msg}]")
119
+ continue
120
+
121
+ # Final result
122
+ final_text = "\n\n---\n\n".join(all_texts)
123
+ yield "Complete", final_text, final_text, page_info, display_image
124
+
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  except Exception as e:
126
+ error_msg = f"Error during processing: {str(e)}"
127
+ yield "Error", error_msg, "", page_info, display_image
128
 
129
 
130
+ # Create Gradio interface
131
+ with gr.Blocks(title="📖 LightOnOCR-1B Demo", theme=gr.themes.Soft()) as demo:
132
  gr.Markdown(
133
  """
134
+ # 📖 LightOnOCR-1B - OCR Demo
135
+ Upload an image or PDF to extract text with configurable quality/speed settings.
 
 
 
 
136
  """
137
  )
138
 
 
144
  type="filepath"
145
  )
146
  rendered_image = gr.Image(
147
+ label="📄 Preview",
148
  type="pil",
149
+ height=300,
150
  interactive=False
151
  )
152
+
153
+ with gr.Accordion("⚙️ Settings", open=True):
154
+ backend_selector = gr.Radio(
155
+ choices=get_available_backends(),
156
+ value="pytorch",
157
+ label="Backend",
158
+ info="PyTorch: best quality | GGUF: faster (if available)"
159
+ )
160
+
161
+ scale_slider = gr.Slider(
162
+ minimum=1.0,
163
+ maximum=3.0,
164
+ value=1.5,
165
+ step=0.5,
166
+ label="PDF Scale",
167
+ info="Higher = better quality, slower"
168
+ )
169
+
170
+ max_tokens_slider = gr.Slider(
171
+ minimum=256,
172
+ maximum=2048,
173
+ value=1024,
174
+ step=256,
175
+ label="Max Tokens",
176
+ info="Lower = faster, may cut off long text"
177
+ )
178
+
179
+ num_pages = gr.Slider(
180
+ minimum=1,
181
+ maximum=10,
182
+ value=1,
183
+ step=1,
184
+ label="PDF Pages",
185
+ info="Number of pages to process (max 10)"
186
+ )
187
+
188
+ temperature = gr.Slider(
189
+ minimum=0.0,
190
+ maximum=1.0,
191
+ value=0.1,
192
+ step=0.05,
193
+ label="Temperature",
194
+ info="0 = deterministic"
195
+ )
196
+
197
  page_info = gr.Textbox(
198
  label="Processing Info",
199
  value="",
200
  interactive=False
201
  )
202
+
203
+ submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
204
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
 
 
 
 
 
 
205
 
206
  with gr.Column(scale=2):
207
+ status_display = gr.Textbox(
208
+ label="Status",
209
+ value="Idle",
210
+ interactive=False
 
 
 
 
 
 
 
 
 
 
211
  )
212
+
213
+ with gr.Tabs():
214
+ with gr.Tab("📄 Rendered"):
215
+ output_text = gr.Markdown(
216
+ value="*Extracted text will appear here...*",
217
+ height=600
218
+ )
219
+ with gr.Tab("📝 Raw Text"):
220
+ raw_output = gr.Textbox(
221
+ placeholder="Raw text will appear here...",
222
+ lines=25,
223
+ show_copy_button=True
224
+ )
225
 
226
+ # Event handlers
227
  submit_btn.click(
228
  fn=process_input,
229
+ inputs=[file_input, backend_selector, scale_slider, temperature, max_tokens_slider, num_pages],
230
+ outputs=[status_display, output_text, raw_output, page_info, rendered_image]
231
  )
232
 
233
  clear_btn.click(
234
+ fn=lambda: ("Idle", None, "*Extracted text will appear here...*", "", "", None),
235
+ outputs=[status_display, file_input, output_text, raw_output, page_info, rendered_image]
236
  )
237
 
238
 
239
  if __name__ == "__main__":
240
+ demo.launch()
backends/__init__.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Backend interface for LightOnOCR-1B inference.
3
+ Supports both PyTorch and GGUF backends.
4
+ """
5
+
6
+ from abc import ABC, abstractmethod
7
+ from typing import List, Tuple
8
+ from PIL import Image
9
+
10
+
11
+ class OCRBackend(ABC):
12
+ """Abstract base class for OCR backends."""
13
+
14
+ @abstractmethod
15
+ def load_model(self):
16
+ """Load the OCR model."""
17
+ pass
18
+
19
+ @abstractmethod
20
+ def process_image(self, image: Image.Image, temperature: float = 0.1) -> str:
21
+ """
22
+ Process a single image and return extracted text.
23
+
24
+ Args:
25
+ image: PIL Image to process
26
+ temperature: Sampling temperature (0 = greedy)
27
+
28
+ Returns:
29
+ Extracted text as string
30
+ """
31
+ pass
32
+
33
+ @abstractmethod
34
+ def get_backend_info(self) -> dict:
35
+ """Return backend information (name, device, memory usage, etc.)."""
36
+ pass
37
+
38
+
39
+ def get_available_backends() -> List[str]:
40
+ """Return list of available backend names."""
41
+ backends = ["pytorch"]
42
+
43
+ # Check for GGUF support (binary or python package)
44
+ from pathlib import Path
45
+ project_root = Path(__file__).parent.parent
46
+ cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
47
+
48
+ if cli_path.exists():
49
+ backends.append("gguf")
50
+ else:
51
+ # Fallback check for python package (though we prefer CLI now)
52
+ try:
53
+ import llama_cpp
54
+ backends.append("gguf")
55
+ except ImportError:
56
+ pass
57
+
58
+ return backends
59
+
60
+
61
+ def create_backend(backend_name: str) -> OCRBackend:
62
+ """
63
+ Factory function to create backend instance.
64
+
65
+ Args:
66
+ backend_name: "pytorch" or "gguf"
67
+
68
+ Returns:
69
+ OCRBackend instance
70
+ """
71
+ if backend_name == "pytorch":
72
+ from .pytorch_backend import PyTorchBackend
73
+ return PyTorchBackend()
74
+ elif backend_name == "gguf":
75
+ from .gguf_backend import GGUFBackend
76
+ return GGUFBackend()
77
+ else:
78
+ raise ValueError(f"Unknown backend: {backend_name}. Available: {get_available_backends()}")
backends/gguf_backend.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GGUF backend for LightOnOCR-1B using local llama-mtmd-cli binary.
3
+ """
4
+
5
+ import os
6
+ import io
7
+ import tempfile
8
+ import subprocess
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ from typing import Optional
12
+
13
+ from . import OCRBackend
14
+
15
+
16
+ class GGUFBackend(OCRBackend):
17
+ """GGUF-based OCR backend using local llama-mtmd-cli binary."""
18
+
19
+ def __init__(self, model_path: Optional[str] = None, mmproj_path: Optional[str] = None):
20
+ """
21
+ Initialize GGUF backend.
22
+
23
+ Args:
24
+ model_path: Path to GGUF model file
25
+ mmproj_path: Path to mmproj file
26
+ """
27
+ self.model_path = model_path
28
+ self.mmproj_path = mmproj_path
29
+ self.cli_path = self._find_cli_binary()
30
+ self._auto_detect_files()
31
+
32
+ def _find_cli_binary(self) -> Optional[str]:
33
+ """Find the llama-mtmd-cli binary."""
34
+ # Check project root llama.cpp build
35
+ project_root = Path(__file__).parent.parent
36
+ cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
37
+ if cli_path.exists():
38
+ return str(cli_path)
39
+ return None
40
+
41
+ def _auto_detect_files(self):
42
+ """Try to find GGUF model and mmproj files."""
43
+ if self.model_path and Path(self.model_path).exists():
44
+ if not self.mmproj_path:
45
+ model_dir = Path(self.model_path).parent
46
+ for mmproj_file in model_dir.glob("*mmproj*.gguf"):
47
+ self.mmproj_path = str(mmproj_file)
48
+ print(f"Auto-detected mmproj: {self.mmproj_path}")
49
+ break
50
+ return
51
+
52
+ search_paths = [
53
+ Path.cwd() / "models",
54
+ Path.cwd() / "gguf_models",
55
+ ]
56
+
57
+ for search_path in search_paths:
58
+ if not search_path.exists():
59
+ continue
60
+ for gguf_file in search_path.rglob("*.gguf"):
61
+ if "lightonocr" in gguf_file.name.lower() and "mmproj" not in gguf_file.name.lower():
62
+ self.model_path = str(gguf_file)
63
+ print(f"Auto-detected model: {self.model_path}")
64
+ model_dir = gguf_file.parent
65
+ for mmproj_file in model_dir.glob("*mmproj*.gguf"):
66
+ self.mmproj_path = str(mmproj_file)
67
+ print(f"Auto-detected mmproj: {self.mmproj_path}")
68
+ break
69
+ break
70
+ if self.model_path:
71
+ break
72
+
73
+ def load_model(self):
74
+ """Verify model, mmproj and CLI binary exist."""
75
+ if not self.cli_path:
76
+ raise RuntimeError(
77
+ "llama-mtmd-cli binary not found.\n"
78
+ "Please build llama.cpp locally:\n"
79
+ " git clone https://github.com/ggerganov/llama.cpp\n"
80
+ " cd llama.cpp && mkdir build && cd build\n"
81
+ " cmake .. -DGGML_METAL=ON && cmake --build . --config Release"
82
+ )
83
+
84
+ if not self.model_path or not Path(self.model_path).exists():
85
+ raise ValueError("GGUF model not found. Run download_gguf_model.py")
86
+
87
+ if not self.mmproj_path or not Path(self.mmproj_path).exists():
88
+ raise ValueError("mmproj file not found. Run download_gguf_model.py")
89
+
90
+ print(f"GGUF Backend ready:")
91
+ print(f" CLI: {self.cli_path}")
92
+ print(f" Model: {self.model_path}")
93
+ print(f" Projector: {self.mmproj_path}")
94
+
95
+ def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
96
+ """Process image using llama-mtmd-cli."""
97
+ if not self.cli_path:
98
+ self.load_model()
99
+
100
+ # Save image to temp file
101
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
102
+ image.save(tmp_img.name)
103
+ tmp_img_path = tmp_img.name
104
+
105
+ try:
106
+ cmd = [
107
+ self.cli_path,
108
+ "-m", self.model_path,
109
+ "--mmproj", self.mmproj_path,
110
+ "--image", tmp_img_path,
111
+ "-p", "Extract all text from this image. Be precise and include all visible text.",
112
+ "--temp", str(temperature),
113
+ "--n-predict", str(max_tokens),
114
+ # "--log-disable" # Removed as it suppresses output
115
+ ]
116
+
117
+ # Run CLI
118
+ result = subprocess.run(cmd, capture_output=True, text=True)
119
+
120
+ if result.returncode != 0:
121
+ print(f"CLI Error: {result.stderr}")
122
+ raise RuntimeError(f"llama-mtmd-cli failed: {result.stderr}")
123
+
124
+ # stdout contains the generated text, stderr contains logs
125
+ return result.stdout.strip()
126
+
127
+ finally:
128
+ if os.path.exists(tmp_img_path):
129
+ os.unlink(tmp_img_path)
130
+
131
+ def get_backend_info(self) -> dict:
132
+ return {
133
+ "name": "GGUF (llama-mtmd-cli)",
134
+ "device": "Metal (via CLI)",
135
+ "model_path": self.model_path or "not found",
136
+ "mmproj_path": self.mmproj_path or "not found",
137
+ "cli_path": self.cli_path
138
+ }
backends/pytorch_backend.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PyTorch backend for LightOnOCR-1B.
3
+ Uses Mistral3ForConditionalGeneration with custom weight remapping.
4
+ """
5
+
6
+ import torch
7
+ import platform
8
+ from pathlib import Path
9
+ from PIL import Image
10
+ from transformers import AutoConfig, PixtralProcessor, Mistral3ForConditionalGeneration
11
+ from safetensors.torch import load_file
12
+ from huggingface_hub import hf_hub_download
13
+
14
+ from . import OCRBackend
15
+
16
+
17
+ class PyTorchBackend(OCRBackend):
18
+ """PyTorch-based OCR backend using transformers."""
19
+
20
+ def __init__(self):
21
+ self.model = None
22
+ self.processor = None
23
+ self.device = None
24
+ self.dtype = None
25
+ self.model_id = "lightonai/LightOnOCR-1B-1025"
26
+
27
+ def load_model(self):
28
+ """Load the PyTorch model with custom weight remapping."""
29
+ if self.model is not None:
30
+ return # Already loaded
31
+
32
+ print(f"Loading {self.model_id} (PyTorch backend)...")
33
+
34
+ # Load processor
35
+ self.processor = PixtralProcessor.from_pretrained(self.model_id, trust_remote_code=True)
36
+
37
+ # Instantiate model with config
38
+ config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
39
+ self.model = Mistral3ForConditionalGeneration(config)
40
+
41
+ # Download and remap weights
42
+ print(" Downloading and remapping weights...")
43
+ weights_path = hf_hub_download(repo_id=self.model_id, filename="model.safetensors")
44
+ state_dict = load_file(weights_path)
45
+
46
+ new_state_dict = {}
47
+ for k, v in state_dict.items():
48
+ new_key = k
49
+ if "vision_encoder" in k:
50
+ new_key = k.replace("vision_encoder", "vision_tower")
51
+ if "vision_projection" in k:
52
+ new_key = k.replace("vision_projection", "multi_modal_projector")
53
+ new_state_dict[new_key] = v
54
+
55
+ self.model.load_state_dict(new_state_dict, strict=False)
56
+
57
+ # Determine device
58
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
59
+ if platform.system() == "Darwin" and "arm" in platform.machine().lower():
60
+ self.device = "mps"
61
+
62
+ # MPS has issues with float16, use float32
63
+ if self.device == "mps":
64
+ self.dtype = torch.float32
65
+ else:
66
+ self.dtype = torch.float16 if self.device == "cuda" else torch.float32
67
+
68
+ self.model = self.model.to(device=self.device, dtype=self.dtype)
69
+ self.model.eval()
70
+
71
+ print(f" Model loaded on {self.device} ({self.dtype})")
72
+
73
+ def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
74
+ """Process image using PyTorch model."""
75
+ if self.model is None:
76
+ self.load_model()
77
+
78
+ messages = [
79
+ {
80
+ "role": "user",
81
+ "content": [
82
+ {"type": "image", "image": image},
83
+ {"type": "text", "text": "Extract all text from this image. Be precise and include all visible text."}
84
+ ]
85
+ }
86
+ ]
87
+
88
+ prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
89
+ inputs = self.processor(text=prompt, images=image, return_tensors="pt")
90
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
91
+
92
+ # Ensure pixel_values match model dtype (critical for MPS)
93
+ if 'pixel_values' in inputs:
94
+ inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)
95
+
96
+ with torch.no_grad():
97
+ generated_ids = self.model.generate(
98
+ **inputs,
99
+ max_new_tokens=max_tokens,
100
+ temperature=temperature,
101
+ do_sample=temperature > 0,
102
+ pad_token_id=self.processor.tokenizer.eos_token_id
103
+ )
104
+
105
+ input_len = inputs['input_ids'].shape[1] if 'input_ids' in inputs else 0
106
+ new_tokens = generated_ids[:, input_len:] if generated_ids.shape[1] > input_len else generated_ids
107
+ generated_text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
108
+
109
+ return generated_text.strip()
110
+
111
+ def get_backend_info(self) -> dict:
112
+ """Return backend information."""
113
+ return {
114
+ "name": "PyTorch",
115
+ "device": str(self.device) if self.device else "not loaded",
116
+ "dtype": str(self.dtype) if self.dtype else "not loaded",
117
+ "model_id": self.model_id,
118
+ "loaded": self.model is not None
119
+ }
docs/gguf_setup.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GGUF Backend Setup Guide
2
+
3
+ ## Quick Start (Recommended)
4
+
5
+ Since `llama-cpp-python` doesn't yet support LightOnOCR, we must build `llama.cpp` locally.
6
+
7
+ ### 1. Build llama.cpp locally
8
+
9
+ ```bash
10
+ # Clone repository
11
+ git clone https://github.com/ggerganov/llama.cpp
12
+ cd llama.cpp
13
+
14
+ # Create build directory
15
+ mkdir build && cd build
16
+
17
+ # Build with Metal support (macOS)
18
+ cmake .. -DGGML_METAL=ON
19
+ cmake --build . --config Release -j 8
20
+
21
+ # Verify build
22
+ ./bin/llama-mtmd-cli --help
23
+ ```
24
+
25
+ ### 2. Download GGUF Model
26
+
27
+ ```bash
28
+ # Return to project root
29
+ cd ../../
30
+
31
+ # Run download script
32
+ python download_gguf_model.py
33
+ ```
34
+
35
+ ### 3. Use GGUF Backend
36
+
37
+ ```bash
38
+ # CLI
39
+ python ocr_cli.py document.pdf --backend gguf
40
+
41
+ # Gradio UI
42
+ python app.py
43
+ # Select "gguf" from backend dropdown
44
+ ```
45
+
46
+ ## Performance
47
+
48
+ The custom built `llama-mtmd-cli` provides incredible performance on Apple Silicon:
49
+
50
+ | Backend | Time per Page | Speedup |
51
+ |---------|---------------|---------|
52
+ | PyTorch (Original) | ~4 mins | 1x |
53
+ | PyTorch (Optimized) | ~40 sec | 6x |
54
+ | **GGUF (llama-mtmd-cli)** | **~3 sec** | **80x** ⭐ |
55
+
56
+ ## Troubleshooting
57
+
58
+ ### "llama-mtmd-cli binary not found"
59
+ Ensure you successfully built `llama.cpp` and the binary exists at `llama.cpp/build/bin/llama-mtmd-cli`.
60
+
61
+ ### "GGUF model not found"
62
+ Run `python download_gguf_model.py` to download the required model files.
docs/gguf_status.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GGUF Backend Status
2
+
3
+ ## Current Status: ⚠️ Not Yet Supported
4
+
5
+ The GGUF backend infrastructure is **fully implemented and ready**, but cannot be used yet due to a limitation in llama.cpp.
6
+
7
+ ### Issue
8
+
9
+ LightOnOCR-1B uses a custom multimodal projector type (`lightonocr`) that is not yet supported in the standard llama.cpp library:
10
+
11
+ ```
12
+ clip_init: failed to load model: load_hparams: unknown projector type: lightonocr
13
+ ```
14
+
15
+ ### What's Ready
16
+
17
+ ✅ llama-cpp-python installed with Metal support
18
+ ✅ GGUF Q8_0 model downloaded (767MB)
19
+ ✅ mmproj file downloaded (417MB)
20
+ ✅ Complete backend implementation (`backends/gguf_backend.py`)
21
+ ✅ CLI and UI integration
22
+
23
+ ### What's Needed
24
+
25
+ ❌ llama.cpp support for LightOnOCR projector type
26
+
27
+ ### Workaround Options
28
+
29
+ 1. **Wait for official support** - Monitor llama.cpp repository
30
+ 2. **Use PyTorch backend** - Fully functional, ~40s per page
31
+ 3. **Contribute to llama.cpp** - Add LightOnOCR projector support
32
+
33
+ ### When Will GGUF Work?
34
+
35
+ The GGUF backend will work automatically once llama.cpp adds support for the `lightonocr` projector type. No code changes will be needed in this project - just update llama-cpp-python:
36
+
37
+ ```bash
38
+ pip install --upgrade llama-cpp-python
39
+ ```
40
+
41
+ ### Alternative: Use PyTorch
42
+
43
+ The PyTorch backend is fully optimized and works well:
44
+
45
+ ```bash
46
+ # Recommended settings
47
+ python ocr_cli.py document.pdf --scale 1.0 --max-tokens 1024
48
+
49
+ # Result: ~40 seconds per page
50
+ ```
51
+
52
+ ## References
53
+
54
+ - [llama.cpp GitHub](https://github.com/ggerganov/llama.cpp)
55
+ - [LightOnOCR GGUF Models](https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF)
56
+ - [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
57
+
58
+ ## Monitoring
59
+
60
+ Check these for updates:
61
+ - llama.cpp issues/PRs mentioning LightOnOCR
62
+ - llama-cpp-python release notes
63
+ - LightOnOCR Hugging Face discussions
docs/performance_optimization.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Оптимізація швидкодії LightOnOCR-1B на M3 Max
2
+
3
+ ## Поточна ситуація
4
+ - **PyTorch на MPS**: ~4 хвилини на сторінку (дуже повільно)
5
+ - **Причина**: MPS backend значно повільніший за CUDA для трансформерів
6
+
7
+ ## Швидкі оптимізації (PyTorch)
8
+
9
+ ### 1. Зменшення max_tokens
10
+ ```python
11
+ # У backends/pytorch_backend.py, рядок ~95
12
+ generated_ids = self.model.generate(
13
+ **inputs,
14
+ max_new_tokens=1024, # Було 2048, зменшити до 512-1024
15
+ temperature=temperature,
16
+ do_sample=temperature > 0,
17
+ pad_token_id=self.processor.tokenizer.eos_token_id
18
+ )
19
+ ```
20
+
21
+ ### 2. Використання нижчої роздільної здатності
22
+ ```bash
23
+ # Замість scale=1.5, використовуйте scale=1.0
24
+ python ocr_cli.py document.pdf --scale 1.0
25
+ ```
26
+
27
+ ## Рекомендоване рішення: GGUF + llama.cpp
28
+
29
+ ### Чому GGUF швидший?
30
+ - Оптимізований для Apple Silicon (Metal)
31
+ - Квантизація (Q8_0) зменшує розмір і прискорює
32
+ - Спеціалізований inference engine
33
+
34
+ ### Встановлення
35
+
36
+ ```bash
37
+ # 1. Встановити llama-cpp-python з Metal support
38
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
39
+
40
+ # 2. Завантажити GGUF модель
41
+ git lfs install
42
+ git clone https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF
43
+ # Або
44
+ git clone https://huggingface.co/Mungert/LightOnOCR-1B-1025-GGUF
45
+ ```
46
+
47
+ ### Використання
48
+ ```bash
49
+ # Після завантаження моделі
50
+ python ocr_cli.py document.pdf --backend gguf --model-path path/to/model.gguf
51
+ ```
52
+
53
+ ## Очікувані результати
54
+ - **PyTorch оптимізований**: ~2-3 хвилини на сторінку
55
+ - **GGUF Q8_0**: ~30-60 секунд на сторінку (орієнтовно)
56
+
57
+ ## Примітка
58
+ GGUF backend уже реалізований у `backends/gguf_backend.py`, але поки що заблокований: llama.cpp ще не підтримує тип проєктора `lightonocr`. Щоб скористатися ним, потрібно:
59
+ 1. Завантажити GGUF модель
60
+ 2. Дочекатися підтримки проєктора `lightonocr` у llama.cpp / llama-cpp-python
download_gguf_model.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download GGUF model and mmproj files for LightOnOCR-1B.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
def download_gguf_model():
    """Download (or update) the LightOnOCR-1B GGUF model repository via git-lfs.

    Clones https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF into
    models/lightonocr-gguf (or runs `git pull` if it is already cloned),
    then lists the downloaded *.gguf files and prints usage hints.

    Exits with status 1 if git or git-lfs is not installed.
    """
    models_dir = Path("models/lightonocr-gguf")
    models_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("LightOnOCR-1B GGUF Model Download")
    print("=" * 60)
    print()
    print("This will download ~1-2GB of model files.")
    print(f"Target directory: {models_dir.absolute()}")
    print()

    # Verify git-lfs is available before attempting the LFS-backed clone.
    import subprocess
    try:
        result = subprocess.run(["git", "lfs", "version"], capture_output=True, text=True)
        if result.returncode != 0:
            print("ERROR: git-lfs not installed!")
            print()
            print("Install git-lfs first:")
            print("  macOS: brew install git-lfs")
            print("  Then run: git lfs install")
            sys.exit(1)
    except FileNotFoundError:
        # `git` binary itself is missing.
        print("ERROR: git not found!")
        sys.exit(1)

    # Clone the repository, or update it if it already exists.
    repo_url = "https://huggingface.co/ggml-org/LightOnOCR-1B-1025-GGUF"

    if (models_dir / ".git").exists():
        print("Model directory already exists. Updating...")
        # BUG FIX: run `git pull` with cwd= instead of os.chdir(). The old
        # os.chdir(models_dir) made the relative `models_dir` path used by
        # the glob() below resolve to the wrong directory, and leaked the
        # working-directory change to the rest of the process.
        subprocess.run(["git", "pull"], check=True, cwd=models_dir)
    else:
        print(f"Cloning from {repo_url}...")
        subprocess.run([
            "git", "clone",
            repo_url,
            str(models_dir)
        ], check=True)

    print()
    print("✓ Download complete!")
    print()
    print("Downloaded files:")
    for gguf_file in models_dir.glob("*.gguf"):
        size_mb = gguf_file.stat().st_size / (1024 * 1024)
        print(f"  - {gguf_file.name} ({size_mb:.1f} MB)")

    print()
    print("Recommended files for use:")
    print("  Model:  LightOnOCR-1B-1025-Q8_0.gguf (best quality)")
    print("  mmproj: mmproj-Q8_0.gguf or mmproj-f16.gguf")
    print()
    print("Usage:")
    print("  python ocr_cli.py document.pdf --backend gguf")
    print()


if __name__ == "__main__":
    download_gguf_model()
download_model.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from huggingface_hub import HfApi, snapshot_download
8
+
9
+
10
+ def parse_args() -> argparse.Namespace:
11
+ parser = argparse.ArgumentParser(
12
+ description="Download a model repository from Hugging Face Hub."
13
+ )
14
+ parser.add_argument(
15
+ "model_id",
16
+ nargs="?",
17
+ default="lightonai/LightOnOCR-1B-1025",
18
+ help="Model repository to download (default: %(default)s)",
19
+ )
20
+ parser.add_argument(
21
+ "--revision",
22
+ default=None,
23
+ help="Specific git revision (branch/tag/commit) to download.",
24
+ )
25
+ parser.add_argument(
26
+ "--cache-dir",
27
+ default=None,
28
+ help="Cache directory where the model snapshot will be stored.",
29
+ )
30
+ parser.add_argument(
31
+ "--local-dir",
32
+ default=None,
33
+ help="Optional local directory to copy the snapshot into after download.",
34
+ )
35
+ parser.add_argument(
36
+ "--token",
37
+ default=None,
38
+ help="Hugging Face access token; defaults to HF_TOKEN or HUGGINGFACEHUB_API_TOKEN env vars.",
39
+ )
40
+ parser.add_argument(
41
+ "--allow-pattern",
42
+ action="append",
43
+ default=None,
44
+ help="File glob pattern(s) to include when downloading.",
45
+ )
46
+ parser.add_argument(
47
+ "--ignore-pattern",
48
+ action="append",
49
+ default=None,
50
+ help="File glob pattern(s) to exclude when downloading.",
51
+ )
52
+ parser.add_argument(
53
+ "--offline",
54
+ action="store_true",
55
+ help="Run in offline mode, using only the local cache.",
56
+ )
57
+ return parser.parse_args()
58
+
59
+
60
+ def resolve_token(user_token: str | None) -> str | None:
61
+ if user_token:
62
+ return user_token
63
+ return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
64
+
65
+
66
+ def ensure_auth(token: str | None) -> None:
67
+ if token:
68
+ return
69
+ try:
70
+ api = HfApi()
71
+ if api.whoami():
72
+ return
73
+ except Exception:
74
+ pass
75
+
76
+ raise RuntimeError(
77
+ "Hugging Face token not provided. Set HF_TOKEN or run `huggingface-cli login`."
78
+ )
79
+
80
+
81
def main() -> None:
    """CLI entry point: authenticate, fetch the model snapshot, print its path."""
    args = parse_args()
    token = resolve_token(args.token)

    if not args.offline:
        # Online downloads need some form of authentication up front;
        # offline mode only touches the local cache.
        ensure_auth(token)

    download_kwargs = {
        "repo_id": args.model_id,
        "revision": args.revision,
        "cache_dir": args.cache_dir,
        "local_dir": args.local_dir,
        "allow_patterns": args.allow_pattern,
        "ignore_patterns": args.ignore_pattern,
        "token": token,
        "local_files_only": args.offline,
    }
    try:
        snapshot_path = snapshot_download(**download_kwargs)
    except Exception as exc:
        print(f"Failed to download {args.model_id}: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"Model snapshot available at: {Path(snapshot_path).resolve()}")


if __name__ == "__main__":
    main()
llama.cpp ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit bd2a93d4753c4f00443f561ee039220283016ee8
ocr_cli.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCR CLI utility for LightOnOCR-1B with backend support.
4
+ Supports PyTorch and GGUF backends for flexible performance/quality trade-offs.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import argparse
10
+ import time
11
+ from pathlib import Path
12
+ from PIL import Image
13
+ import pypdfium2 as pdfium
14
+
15
+ # Add project root to path
16
+ sys.path.insert(0, str(Path(__file__).parent))
17
+
18
+ from backends import create_backend, get_available_backends
19
+
20
+
21
def render_pdf_page(page, scale=2.0):
    """Rasterize a single PDF page to a PIL Image.

    Args:
        page: pypdfium2 page object.
        scale: Render scale factor; higher yields a larger bitmap.
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
24
+
25
+
26
def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024):
    """
    Process a PDF or image file with OCR and write <input>.md next to it.

    Args:
        input_path: Path to input file
        backend_name: "pytorch" or "gguf"
        scale: PDF rendering scale (lower = faster, higher = better quality)
        temperature: Sampling temperature for generation
        max_tokens: Maximum tokens to generate (lower = faster)
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.")
        return

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()

    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images: render every PDF page, or open the single image directly.
    images = []
    if input_path.suffix.lower() == '.pdf':
        print(f"\nProcessing PDF: {input_path.name}")
        pdf = pdfium.PdfDocument(str(input_path))
        try:
            num_pages = len(pdf)
            print(f" Total pages: {num_pages}")
            print(f" Rendering scale: {scale}x")

            for i in range(num_pages):
                print(f" Rendering page {i+1}/{num_pages}...", end=" ")
                start = time.time()
                images.append(render_pdf_page(pdf[i], scale=scale))
                print(f"({time.time() - start:.1f}s)")
        finally:
            # BUG FIX: close the document even when rendering raises.
            pdf.close()
    else:
        print(f"Processing image: {input_path.name}")
        images = [Image.open(input_path)]

    # BUG FIX: a zero-page PDF previously caused a ZeroDivisionError in the
    # per-page timing report below; bail out early instead.
    if not images:
        print("No pages to process.")
        return

    # Run OCR page by page; a failure on one page is recorded inline so the
    # remaining pages are still processed.
    all_texts = []
    total_start = time.time()

    for i, img in enumerate(images):
        print(f"\n OCR on page {i+1}/{len(images)}...", end=" ")
        start = time.time()

        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start

            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f" Preview: {text[:80]}...")
        except Exception as e:
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results as Markdown next to the input file.
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')

    total_time = time.time() - total_start
    print(f"\n✓ OCR Complete!")
    print(f" Total time: {total_time:.1f}s ({total_time/len(images):.1f}s per page)")
    print(f" Output: {output_path}")
98
+
99
+
100
def main():
    """Command-line entry point: parse arguments and run OCR on the input file."""
    arg_parser = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process with PyTorch (default, best quality)
  python ocr_cli.py document.pdf

  # Process with GGUF (faster, requires llama-cpp-python)
  python ocr_cli.py document.pdf --backend gguf

  # Fast processing with lower resolution
  python ocr_cli.py document.pdf --scale 1.5

  # High quality with higher resolution
  python ocr_cli.py document.pdf --scale 3.0
"""
    )

    arg_parser.add_argument(
        "input_file",
        nargs="?",
        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
        help="Input PDF or image file (default: test PDF)"
    )
    arg_parser.add_argument(
        "--backend",
        choices=get_available_backends(),
        default="pytorch",
        help="Backend to use for inference (default: pytorch)"
    )
    arg_parser.add_argument(
        "--scale",
        type=float,
        default=2.0,
        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)"
    )
    arg_parser.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Sampling temperature (default: 0.1, 0=greedy)"
    )
    arg_parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate (default: 1024, range: 256-2048)"
    )

    args = arg_parser.parse_args()

    # Out-of-range scales are allowed, but warn the user.
    if args.scale < 1.0 or args.scale > 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
    except Exception as e:
        print(f"\nFatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1 +1,11 @@
1
- pypdfium2 == 4.30.0
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.42.0
2
+ pillow>=10.3.0,<11
3
+ pypdfium2==4.30.0
4
+ requests>=2.31.0,<3
5
+ huggingface_hub>=0.24.0
6
+ torch>=2.0.0
7
+ transformers>=4.36.0
8
+ accelerate>=0.26.0
9
+ safetensors>=0.4.0
10
+ # llama-cpp-python is optional for GGUF backend support (or use local build)
11
+ # llama-cpp-python>=0.3.0