Spaces:

lucasgagneten
/

OCR-NER-Facturas

Sleeping

App Files Files Community

Lucas Gagneten committed on Oct 11

Commit

ced724a

1 Parent(s): a77e9d7

fix: Mejorar manejo de errores y configuración de Tesseract

Browse files

Files changed (1) hide show

app.py +105 -38

app.py CHANGED Viewed

@@ -22,22 +22,70 @@ ALL_NER_TAGS = [
 ]
 ALL_NER_TAGS = sorted(list(set(ALL_NER_TAGS))) # Limpiar y ordenar
-# Configuración de Tesseract (la instalación se maneja con setup.sh)
-# Asegurar que Tesseract utiliza el idioma español
-pytesseract.tesseract_cmd = 'tesseract'
 # --- 2. FUNCIONES DE PROCESAMIENTO ---
 def get_ocr_data(image: Image.Image):
     """Ejecuta Tesseract y devuelve la imagen, tokens y bboxes normalizados."""
     if image is None:
-        return None, []
-    W, H = image.size
-    # Obtener datos de la imagen con idioma español
-    hocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT, lang='spa')
-    tokens_data = []
     for i in range(len(hocr_data['text'])):
         text = hocr_data['text'][i].strip()
@@ -67,7 +115,12 @@ def get_ocr_data(image: Image.Image):
                 'ner_tag': 'O' # Inicializar con 'O'
             })
-    return image, tokens_data
 def draw_boxes(image: Image.Image, tokens_data: list, highlight_index: int = -1):
     """Dibuja un resaltado en la imagen para el bounding box seleccionado."""
@@ -94,8 +147,12 @@ def process_and_setup(image_file):
         empty_df = {'token': [], 'ner_tag': []}
         return None, [], None, empty_df, "Cargue una imagen para comenzar."
-    image_orig, tokens_data = get_ocr_data(image_file)
     if not tokens_data:
         empty_df = {'token': [], 'ner_tag': []}
         return image_orig, [], None, empty_df, "OCR completado. No se detectaron tokens válidos."
@@ -141,34 +198,40 @@ def export_data(image_orig: Image.Image, tokens_data: list):
     if not tokens_data:
         return None, "Error: No hay datos de anotación para exportar."
-    # Se usa JSON estructurado en lugar de PASCAL VOC, ya que PASCAL VOC es para
-    # detección de objetos, y este es un problema de NER a nivel de token/bbox,
-    # siendo JSON el formato estándar para el fine-tuning de LayoutXLM.
-    W, H = image_orig.size
-    output_data = {
-        'metadata': {
-            'image_size': [W, H],
-            'format': 'Structured JSON for LayoutXLM Fine-Tuning',
-            'note': 'Contains tokens, bboxes normalized to 0-1000, and NER tags.'
-        },
-        'annotations': []
-    }
-    for item in tokens_data:
-        output_data['annotations'].append({
-            'token': item['token'],
-            'bbox_normalized': item['bbox_norm'],
-            'ner_tag': item['ner_tag']
-        })
-    # Guardar el archivo temporal en el disco del Space para que pueda ser descargado
-    temp_file = "anotacion_factura.json"
-    with open(temp_file, 'w', encoding='utf-8') as f:
-        json.dump(output_data, f, ensure_ascii=False, indent=4)
-    return temp_file, "✅ Exportación exitosa. Descarga el archivo JSON."
 # --- 3. INTERFAZ GRADIO (GR.BLOCKS) ---
@@ -257,4 +320,8 @@ with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
     )
 if __name__ == "__main__":
-    app.launch()

 ]
 ALL_NER_TAGS = sorted(list(set(ALL_NER_TAGS))) # Limpiar y ordenar
+# Configuración de Tesseract
+# En Windows, necesitamos especificar la ruta completa al ejecutable de Tesseract
+if os.name == 'nt':  # Windows
+    tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+    if os.path.exists(tesseract_path):
+        pytesseract.tesseract_cmd = tesseract_path
+    else:
+        print("ADVERTENCIA: Tesseract no encontrado en la ruta por defecto de Windows.")
+        print("Por favor, instale Tesseract-OCR desde: https://github.com/UB-Mannheim/tesseract/wiki")
+        print("O actualice la variable pytesseract.tesseract_cmd con la ruta correcta.")
+else:  # Linux/Mac
+    pytesseract.tesseract_cmd = 'tesseract'
 # --- 2. FUNCIONES DE PROCESAMIENTO ---
 def get_ocr_data(image: Image.Image):
     """Ejecuta Tesseract y devuelve la imagen, tokens y bboxes normalizados."""
     if image is None:
+        return None, [], "Error: No se proporcionó ninguna imagen"
+    if not os.path.exists(pytesseract.tesseract_cmd):
+        return None, [], "Error: Tesseract no está instalado o la ruta no es correcta"
+    try:
+        W, H = image.size
+        # Obtener datos de la imagen con idioma español
+        hocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT, lang='spa')
+        tokens_data = []
+        for i in range(len(hocr_data['text'])):
+            text = hocr_data['text'][i].strip()
+            # Filtro: Nivel de palabra (5), texto no vacío, confianza > 50
+            if hocr_data['level'][i] == 5 and text and hocr_data['conf'][i] > 50:
+                left = hocr_data['left'][i]
+                top = hocr_data['top'][i]
+                width = hocr_data['width'][i]
+                height = hocr_data['height'][i]
+                # BBox normalizado a 0-1000 (para LayoutXLM)
+                bbox_normalized = [
+                    int(left * 1000 / W),
+                    int(top * 1000 / H),
+                    int((left + width) * 1000 / W),
+                    int((top + height) * 1000 / H)
+                ]
+                # BBox original (en píxeles, para dibujar)
+                bbox_original = [left, top, left + width, top + height]
+                tokens_data.append({
+                    'token': text,
+                    'bbox_norm': bbox_normalized,
+                    'bbox_orig': bbox_original,
+                    'ner_tag': 'O' # Inicializar con 'O'
+                })
+        return image, tokens_data, None
+    except Exception as e:
+        if "tesseract is not installed" in str(e):
+            return None, [], "Error: Tesseract no está instalado o no se encuentra en el PATH del sistema"
+        return None, [], f"Error durante el OCR: {str(e)}"
     for i in range(len(hocr_data['text'])):
         text = hocr_data['text'][i].strip()
                 'ner_tag': 'O' # Inicializar con 'O'
             })
+        return image, tokens_data, None
+    except pytesseract.TesseractNotFoundError:
+        return None, [], "Error: Tesseract no está instalado o no se encuentra en el PATH del sistema"
+    except Exception as e:
+        return None, [], f"Error durante el OCR: {str(e)}"
 def draw_boxes(image: Image.Image, tokens_data: list, highlight_index: int = -1):
     """Dibuja un resaltado en la imagen para el bounding box seleccionado."""
         empty_df = {'token': [], 'ner_tag': []}
         return None, [], None, empty_df, "Cargue una imagen para comenzar."
+    image_orig, tokens_data, error_msg = get_ocr_data(image_file)
+    if error_msg:
+        empty_df = {'token': [], 'ner_tag': []}
+        return None, [], None, empty_df, error_msg
     if not tokens_data:
         empty_df = {'token': [], 'ner_tag': []}
         return image_orig, [], None, empty_df, "OCR completado. No se detectaron tokens válidos."
     if not tokens_data:
         return None, "Error: No hay datos de anotación para exportar."
+    try:
+        # Se usa JSON estructurado en lugar de PASCAL VOC, ya que PASCAL VOC es para
+        # detección de objetos, y este es un problema de NER a nivel de token/bbox,
+        # siendo JSON el formato estándar para el fine-tuning de LayoutXLM.
+        W, H = image_orig.size
+        output_data = {
+            'metadata': {
+                'image_size': [W, H],
+                'format': 'Structured JSON for LayoutXLM Fine-Tuning',
+                'note': 'Contains tokens, bboxes normalized to 0-1000, and NER tags.'
+            },
+            'annotations': []
+        }
+        for item in tokens_data:
+            output_data['annotations'].append({
+                'token': item['token'],
+                'bbox_normalized': item['bbox_norm'],
+                'ner_tag': item['ner_tag']
+            })
+        # Guardar el archivo temporal en el disco del Space para que pueda ser descargado
+        temp_file = "anotacion_factura.json"
+        with open(temp_file, 'w', encoding='utf-8') as f:
+            json.dump(output_data, f, ensure_ascii=False, indent=4)
+        return temp_file, "✅ Exportación exitosa. Descarga el archivo JSON."
+    except IOError as e:
+        return None, f"Error al guardar el archivo: {str(e)}"
+    except Exception as e:
+        return None, f"Error durante la exportación: {str(e)}"
 # --- 3. INTERFAZ GRADIO (GR.BLOCKS) ---
     )
 if __name__ == "__main__":
+    try:
+        app.launch()
+    except Exception as e:
+        print(f"Error crítico durante la ejecución de la aplicación: {str(e)}")
+        raise