# File: llm_processor.py
import os
import json

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Model Configuration
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"

llm = None


def load_llm_model():
    """Downloads and loads the GGUF model from Hugging Face."""
    global llm
    try:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise EnvironmentError("HF_TOKEN environment variable not found.")

        print(f"Downloading model {MODEL_FILE}...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, token=hf_token)

        print("Loading GGUF model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=2,
            n_gpu_layers=0,
            verbose=False
        )
        print("GGUF model loaded successfully.")
    except Exception as e:
        print(f"Fatal error loading LLM: {e}")
        llm = None


def generate_json_from_text(ocr_text: str) -> dict:
    """
    Takes raw OCR text and uses the LLM to convert it into a structured JSON object.
    """
    if not llm:
        raise RuntimeError("LLM is not available.")

    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema.

Follow these rules strictly:
- Output ONLY the JSON object, with no additional text, markdown, or backticks.
- Interpret OCR errors logically and correct them without confusion (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa'; treat * or other artifacts as typos, not synonyms).
- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
- Format invoice_date as DD-MM-YYYY; infer the full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse it with the seller's address).
- For items, parse lines matching the 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
- Be precise and fast; focus only on relevant data.

**JSON Schema:**
{{
  "invoice_number": "string or null",
  "invoice_date": "DD-MM-YYYY or null",
  "seller": "string or null",
  "invoice_to": "string or null",
  "items": [
    {{
      "description": "string",
      "quantity": "integer or null",
      "rate": "float or null",
      "total": "float or null"
    }}
  ],
  "subtotal": "float or null",
  "tax_amount": "float or null",
  "grand_total": "float or null"
}}

**OCR Text:**
{ocr_text}
"""

    output = llm(
        prompt,
        max_tokens=1024,   # Increased for longer JSON
        temperature=0.5,   # Slightly higher for better reasoning
        top_p=0.9,
        stop=["<|endoftext|>"],  # Note: an empty-string stop sequence was removed; "" matches immediately and truncates output.
        echo=False
    )

    generated_text = output["choices"][0]["text"].strip()

    try:
        start_idx = generated_text.find("{")
        end_idx = generated_text.rfind("}") + 1
        # rfind returns -1 when no '}' exists, which makes end_idx 0, so compare
        # against start_idx rather than -1.
        if start_idx != -1 and end_idx > start_idx:
            json_str = generated_text[start_idx:end_idx]
            json_data = json.loads(json_str)
            return json_data
        else:
            raise json.JSONDecodeError("No JSON object found.", generated_text, 0)
    except json.JSONDecodeError:
        # Fallback: Return structured error with cleaned OCR text
        return {
            "error": "LLM failed to generate valid JSON.",
            "raw_output": generated_text,
            "cleaned_ocr_text": ocr_text
        }
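

# --- Usage sketch (illustrative only) ---
# A minimal example of how this module might be driven end to end. The sample
# OCR text below is hypothetical, pieced together from the examples mentioned
# in the prompt above; it is not real receipt data. Running this requires the
# HF_TOKEN environment variable to be set so the model download succeeds.
if __name__ == "__main__":
    load_llm_model()

    sample_ocr = (
        "SHANTHI HOTEL CATERERS\n"
        "Bill #:128998  Date: 17/02/19\n"
        "Masala-Dosa*  2  45.00   90.00\n"
        "Cofee         1  20.00   20.00\n"
        "CGT 13.94  SGT 13.94\n"
        "Net Amount 137.88\n"
    )

    result = generate_json_from_text(sample_ocr)
    print(json.dumps(result, indent=2, ensure_ascii=False))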