Spaces:

lucasgagneten
/

OCR-NER-Facturas

Sleeping

App Files Files Community

Lucas Gagneten committed on Oct 13

Commit

fe821af

1 Parent(s): 43fda59

Generate json

Browse files

Files changed (5) hide show

.gitignore +0 -1
app.py +138 -102
image_loader.py +1 -1
label_editor.py +165 -68
ocr_processor.py +40 -8

.gitignore CHANGED Viewed

@@ -34,7 +34,6 @@ checkpoints/
 # Archivos de anotación
 annotations/
 *.xml
-*.txt
 # Gradio y Hugging Face Spaces
 gradio/

 # Archivos de anotación
 annotations/
 *.xml
 # Gradio y Hugging Face Spaces
 gradio/

app.py CHANGED Viewed

@@ -1,204 +1,240 @@
 import gradio as gr
-from image_loader import setup_image_components
-from ocr_processor import setup_tesseract, process_and_setup
-from label_editor import setup_label_components, update_ui, export_data, ALL_NER_TAGS
 # Configurar Tesseract al inicio
 setup_tesseract()
-# --- 2. INTERFAZ PRINCIPAL ---
-# --- 3. INTERFAZ GRADIO (GR.BLOCKS) ---
-# Función de limpieza (NUEVA FUNCIÓN)
 def clear_ui_and_reset_states():
     """Limpia los componentes de la interfaz y resetea los estados."""
     print("Reiniciando la interfaz y los estados...")
-    # Restablecer los estados a sus valores iniciales
-    new_image_orig_state = gr.State(None)
-    new_tokens_data_state = gr.State([])
-    new_highlight_index_state = gr.State(-1)
-    # Ocultar la visualización y mostrar el cargador
     image_input_update = gr.update(value=None, visible=True)
     image_output_update = gr.update(value=None, visible=False)
-    # Limpiar otros componentes
     df_update = gr.update(value=[])
     status_update = "Sube una imagen para comenzar..."
     return (
-        new_image_orig_state.value,  # image_orig_state
-        new_tokens_data_state.value, # tokens_data_state
-        new_highlight_index_state.value, # highlight_index_state
-        image_input_update,      # image_input_file
-        image_output_update,     # image_output_display
-        df_update,               # df_label_input
-        status_update            # status_output
     )
 with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
     gr.Markdown(
         """
         # 🧾 Anotador NER para Facturas Argentinas (LayoutXLM)
         **Instrucciones:**
-        1. **Sube** una imagen (el OCR se ejecuta automáticamente).
-        2. En la tabla (Dataframe) de la derecha, **edita** la columna `ner_tag` (menú desplegable) para asignar las etiquetas NER (B-X, I-X, O).
-        3. **Haz clic en una fila** de la tabla para **resaltar el Bounding Box** correspondiente en la imagen.
-        4. Al finalizar, haz clic en **"Exportar a JSON"** para descargar el resultado.
         """
     )
     # Componentes de estado
-    image_orig_state = gr.State(None) # Almacena la imagen original
-    tokens_data_state = gr.State([]) # Almacena los tokens, bboxes y etiquetas (el modelo de datos)
-    highlight_index_state = gr.State(-1) # Almacena el índice de la fila seleccionada
     with gr.Row():
         with gr.Column(scale=1):
-            # Columna Izquierda: Carga y Visualización de Imagen
-            # COMPONENTE DE ENTRADA (Visible por defecto, se oculta al subir la imagen)
             image_input_file = gr.Image(
                 type="pil",
-                label="1. Cargar Imagen de Factura (el OCR se ejecuta automáticamente)",
                 sources=["upload"],
                 height=300,
                 interactive=True,
                 visible=True
             )
-            # COMPONENTE DE SALIDA (Oculto por defecto, se muestra al completar el OCR)
             image_output_display = gr.Image(
                 type="pil",
                 label="Factura con Bounding Box Resaltado",
                 interactive=False,
-                height=500,
                 visible=False
             )
             status_output = gr.Markdown("Sube una imagen para comenzar...")
-            # 🚀 BOTÓN NUEVO
             btn_clear = gr.Button("🗑️ Quitar Imagen / Nuevo Documento", visible=True)
         with gr.Column(scale=2):
             # Columna Derecha: Edición de Etiquetas
-            gr.Markdown("### 2. Edición de Etiquetas NER (Hacer clic en la fila para resaltar)")
-            df_label_input, btn_export, file_output = setup_label_components()
-    # --- FLUJO DE EVENTOS ---
-    # 1. Paso 1: Procesar la imagen y obtener tokens (Función del OCR)
-    def process_image(image):
-        if image is None:
-            # Retornar el valor por defecto y mantener la visibilidad inicial
-            # La salida es: [orig_st, tokens_st, output_img, df_input, status, input_file_v, output_display_v]
-            return None, [], None, [], "Sube una imagen para comenzar...", gr.update(visible=True), gr.update(visible=False)
-        try:
-            result = process_and_setup(image)
-            print("Procesamiento OCR completado")
-            if result[0] is None:
-                # Si hay error, mantener la visibilidad de carga y resetear el display
-                return None, [], None, [], "Error en el procesamiento del OCR. Verifica logs.", gr.update(visible=True), gr.update(visible=False, value=None)
-            image_orig, tokens_data, highlighted_image, df_data, status = result
-            print(f"Tokens detectados: {len(tokens_data) if tokens_data else 0}")
-            # Convertir datos para el DataFrame de Gradio (lista de listas)
-            df_rows = []
-            if df_data and isinstance(df_data, dict):
-                for t, n in zip(df_data['token'], df_data['ner_tag']):
-                    df_rows.append([t, n])
-            # Devolvemos los estados y la actualización de visibilidad: Ocultar input, Mostrar output
-            return (
-                image_orig,
-                tokens_data,
-                highlighted_image,
-                df_rows,
-                status,
-                gr.update(visible=False), # Ocultar image_input_file
-                gr.update(visible=True)  # Mostrar image_output_display
-            )
-        except Exception as e:
-            print(f"Error en process_image: {str(e)}")
-            # En caso de error, mantener la visibilidad de carga y resetear el display
-            return None, [], None, [], f"Error: {str(e)}", gr.update(visible=True), gr.update(visible=False, value=None)
     # CONEXIÓN 1: EJECUTAR OCR AUTOMÁTICAMENTE AL CARGAR IMAGEN
     image_input_file.change(
         fn=process_image,
         inputs=[image_input_file],
-        # image_input_file y image_output_display deben estar aquí para controlar su visibilidad
         outputs=[
-            image_orig_state,
-            tokens_data_state,
-            image_output_display,
-            df_label_input,
-            status_output,
-            image_input_file,
-            image_output_display
         ],
         api_name=False
     )
-    # --- Funciones Auxiliares ---
-    def capture_highlight_index(evt: gr.SelectData):
-        """Captura el índice de la fila seleccionada si es válido."""
-        if evt and evt.index is not None and evt.index[0] is not None:
-            return evt.index[0]
-        return gr.State(-1)
-    # CONEXIÓN 2: Al hacer clic en una fila del Dataframe (Resaltado)
     df_label_input.select(
         fn=capture_highlight_index,
         inputs=None,
         outputs=[highlight_index_state],
         queue=False
     ).then(
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
-    # CONEXIÓN 3: Al editar una celda del Dataframe (Guardar la edición y refrescar el resaltado)
-    df_label_input.change(
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
-    # CONEXIÓN 4: Exportar
     btn_export.click(
         fn=export_data,
-        inputs=[image_orig_state, tokens_data_state],
         outputs=[file_output, status_output],
         api_name=False
     )
-    # CONEXIÓN 5: Limpiar y Reiniciar (NUEVA CONEXIÓN)
     btn_clear.click(
         fn=clear_ui_and_reset_states,
-        # No toma inputs
         inputs=None,
         outputs=[
             image_orig_state,
             tokens_data_state,
             highlight_index_state,
             image_input_file,
             image_output_display,
             df_label_input,
             status_output
         ],
         api_name=False

 import gradio as gr
+from image_loader import setup_image_components # Asumo que este módulo existe
+from ocr_processor import setup_tesseract, process_and_setup
+from label_editor import setup_label_components, update_ui, export_data, update_dataframe_and_state, display_selected_row, ALL_NER_TAGS
 # Configurar Tesseract al inicio
 setup_tesseract()
+# --- Función de Limpieza (Actualizada) ---
 def clear_ui_and_reset_states():
     """Return the reset values for every state holder and widget in the UI.

     The tuple is positional: its ten entries line up with the ``outputs``
     list of the "clear" button (four state values, then the uploader, the
     viewer, the table, the two editors and finally the status text).
     """
     print("Reiniciando la interfaz y los estados...")

     return (
         None,                                  # image_orig_state: no image loaded
         [],                                    # tokens_data_state: no tokens
         -1,                                    # highlight_index_state: nothing selected
         None,                                  # image_filename_state: no saved file
         gr.update(value=None, visible=True),   # image_input_file: empty, shown again
         gr.update(value=None, visible=False),  # image_output_display: empty, hidden
         gr.update(value=[]),                   # df_label_input: empty table
         gr.update(value="", visible=False),    # tb_token_editor: blank, hidden
         gr.update(value="O", visible=False),   # dd_tag_selector: default tag, hidden
         "Sube una imagen para comenzar...",    # status_output
     )
+# --- FUNCIONES AUXILIARES DE FLUJO ---
def process_image(image):
    """Run OCR on an uploaded image and produce the eight outputs of the upload event.

    The returned tuple is, in order: original image, token records, image to
    display, table rows, status text, update for the uploader, update for the
    viewer, and the file name under which the image was saved.

    Any exception raised while processing is caught, printed, and reported
    through the status text so the event always yields eight values.
    """
    if image is None:
        # Nothing uploaded: keep the uploader visible and the viewer hidden.
        return (
            None, [], None, [], "Sube una imagen para comenzar...",
            gr.update(visible=True), gr.update(visible=False), None,
        )

    try:
        # process_and_setup yields six values; a None first item signals failure.
        outcome = process_and_setup(image)
        if outcome[0] is None:
            return (
                None, [], None, [],
                "Error en el procesamiento del OCR. Verifica logs.",
                gr.update(visible=True), gr.update(visible=False, value=None), None,
            )

        image_orig, tokens_data, highlighted_image, df_data, status, image_filename = outcome

        # The table component takes a list of [token, ner_tag] rows.
        if df_data and isinstance(df_data, dict):
            df_rows = [
                [token_text, tag]
                for token_text, tag in zip(df_data['token'], df_data['ner_tag'])
            ]
        else:
            df_rows = []

        hide_uploader = gr.update(visible=False)
        show_viewer = gr.update(visible=True)
        return (
            image_orig,
            tokens_data,
            highlighted_image,
            df_rows,
            status,
            hide_uploader,
            show_viewer,
            image_filename,
        )
    except Exception as e:
        detail = str(e)
        print(f"Error en process_image: {detail}")
        return (
            None, [], None, [], f"Error: {detail}",
            gr.update(visible=True), gr.update(visible=False, value=None), None,
        )
def capture_highlight_index(evt: gr.SelectData):
    """Return the zero-based row index of the selected Dataframe cell.

    Args:
        evt: Selection event injected by Gradio. Its ``index`` is read as a
            ``[row, column]`` pair; a bare integer is also accepted and
            treated as the row.

    Returns:
        The selected row as an int, or ``-1`` when there is no usable
        selection.
    """
    if evt is None or evt.index is None:
        return -1

    index = evt.index
    if isinstance(index, (list, tuple)):
        row = index[0] if index else None
    else:
        row = index

    # Return the plain value: the output is a gr.State, and handing back a new
    # gr.State(-1) component would store an object that later fails the
    # `highlight_index >= 0` comparison downstream.
    return row if row is not None else -1
+# --- INTERFAZ GRADIO (GR.BLOCKS) ---
 with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
     gr.Markdown(
         """
         # 🧾 Anotador NER para Facturas Argentinas (LayoutXLM)
         **Instrucciones:**
+        1. **Sube** una imagen. La imagen se guarda automáticamente en `dataset/imagenes`.
+        2. **Haz clic en una FILA** de la tabla para **seleccionar** el token y mostrar los editores.
+        3. El cambio de la **Etiqueta NER** (Dropdown) y la edición del **Token** (Textbox) se aplican **automáticamente**.
+        4. Al exportar, la información se **agrega** a `anotacion_factura.json`.
         """
     )
     # Componentes de estado
+    image_orig_state = gr.State(None)
+    tokens_data_state = gr.State([]) # Estado de la verdad que incluye tokens y bboxes
+    highlight_index_state = gr.State(-1) # Índice de la fila seleccionada
+    image_filename_state = gr.State(None) # Nombre de archivo único para la exportación
     with gr.Row():
         with gr.Column(scale=1):
+            # Columna Izquierda: Carga y Visualización
             image_input_file = gr.Image(
                 type="pil",
+                label="1. Cargar Imagen de Factura",
                 sources=["upload"],
                 height=300,
                 interactive=True,
                 visible=True
             )
             image_output_display = gr.Image(
                 type="pil",
                 label="Factura con Bounding Box Resaltado",
                 interactive=False,
+                height=800,
                 visible=False
             )
             status_output = gr.Markdown("Sube una imagen para comenzar...")
             btn_clear = gr.Button("🗑️ Quitar Imagen / Nuevo Documento", visible=True)
         with gr.Column(scale=2):
             # Columna Derecha: Edición de Etiquetas
+            gr.Markdown("### 2. Edición de Etiquetas NER")
+            # CAPTURAR COMPONENTES (df_label_input, tb_token_editor, dd_tag_selector, btn_export, file_output)
+            df_label_input, tb_token_editor, dd_tag_selector, btn_export, file_output = setup_label_components()
+            # Contenedor para los editores (Token y Tag en dos columnas contiguas)
+            with gr.Row(visible=True) as editor_row:
+                with gr.Column(scale=2):
+                    tb_token_editor # Textbox a la izquierda (más ancho)
+                with gr.Column(scale=1):
+                    dd_tag_selector # Dropdown a la derecha (más estrecho)
+    # --- CONEXIONES DE EVENTOS ---
     # CONEXIÓN 1: EJECUTAR OCR AUTOMÁTICAMENTE AL CARGAR IMAGEN
     image_input_file.change(
         fn=process_image,
         inputs=[image_input_file],
         outputs=[
+            image_orig_state, tokens_data_state, image_output_display, df_label_input, status_output,
+            image_input_file, image_output_display, image_filename_state # Nuevo estado
         ],
         api_name=False
     )
+    # CONEXIÓN 2: Al hacer clic en una FILA (Selección/Resaltado)
     df_label_input.select(
         fn=capture_highlight_index,
         inputs=None,
         outputs=[highlight_index_state],
         queue=False
     ).then(
+        # Paso A: Mostrar el token y la etiqueta en los editores externos
+        fn=display_selected_row,
+        inputs=[tokens_data_state, highlight_index_state],
+        outputs=[tb_token_editor, dd_tag_selector, highlight_index_state],
+    ).then(
+        # Paso B: Resaltar la fila en la imagen
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
+    # CONEXIÓN 3.1: Dropdown cambia la etiqueta NER (Actualización Automática con .change())
+    dd_tag_selector.change(
+        # Capturar el valor actual del Dropdown para pasarlo a la función de actualización
+        fn=lambda t, d, i, new_tag_val: update_dataframe_and_state(t, d, new_tag_val, None, i, 'tag'),
+        inputs=[tokens_data_state, df_label_input, highlight_index_state, dd_tag_selector],
+        outputs=[tokens_data_state, df_label_input],
+    ).then(
+        # Refrescar la imagen con el resaltado actualizado
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
+    # CONEXIÓN 3.2: Textbox cambia el Token (Actualización Automática con .blur y .submit)
+    token_update_events = [tb_token_editor.blur, tb_token_editor.submit]
+    for event in token_update_events:
+        event(
+            # Capturar el valor actual del Textbox
+            fn=lambda t, d, i, new_token_val: update_dataframe_and_state(t, d, None, new_token_val, i, 'token'),
+            inputs=[tokens_data_state, df_label_input, highlight_index_state, tb_token_editor],
+            outputs=[tokens_data_state, df_label_input],
+        ).then(
+            # Refrescar la imagen
+            fn=update_ui,
+            inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
+            outputs=[tokens_data_state, image_output_display],
+            api_name=False
+        )
+    # CONEXIÓN 4: Exportar (Añadiendo image_filename_state)
     btn_export.click(
         fn=export_data,
+        inputs=[image_orig_state, tokens_data_state, image_filename_state],
         outputs=[file_output, status_output],
         api_name=False
     )
+    # CONEXIÓN 5: Limpiar y Reiniciar
     btn_clear.click(
         fn=clear_ui_and_reset_states,
         inputs=None,
         outputs=[
             image_orig_state,
             tokens_data_state,
             highlight_index_state,
+            image_filename_state, # Nuevo output de limpieza
             image_input_file,
             image_output_display,
             df_label_input,
+            tb_token_editor,
+            dd_tag_selector,
             status_output
         ],
         api_name=False

image_loader.py CHANGED Viewed

@@ -15,7 +15,7 @@ def setup_image_components():
         type="pil",
         label="Factura con Bounding Box Resaltado",
         interactive=False,
-        height=500
     )
     status_output = gr.Markdown("---")

         type="pil",
         label="Factura con Bounding Box Resaltado",
         interactive=False,
+        height=800
     )
     status_output = gr.Markdown("---")

label_editor.py CHANGED Viewed

@@ -1,106 +1,203 @@
 import gradio as gr
 import json
 from error_handler import ErrorHandler
 from ner_tags import ALL_NER_TAGS
 # --- Funciones de Configuración y UI ---
 def setup_label_components():
     """
-    Configura y retorna los componentes de edición de etiquetas,
-    usando la sintaxis compatible y forzando el tipo 'category'.
     """
-    # 🟢 SOLUCIÓN: Usamos 'category' en el datatype para forzar el menú desplegable.
     df_label_input = gr.Dataframe(
         headers=["token", "ner_tag"],
         col_count=(2, "fixed"),
-        # El primer elemento ('str') es la columna "token".
-        # El segundo elemento (el diccionario) fuerza el tipo 'category' y define las opciones.
-        datatype=["str", {"type": "str", "choices": ALL_NER_TAGS, "default": "O"}],
-        label="Tabla de Tokens y Etiquetas",
-        interactive=True,
         wrap=True,
-        value=[]
     )
-    btn_export = gr.Button("3. Exportar a JSON para Fine-Tuning", variant="secondary")
     file_output = gr.File(label="Archivo de Anotación JSON")
-    return df_label_input, btn_export, file_output
-def update_ui(image_orig, tokens_data: list, df_labels: list, highlight_index: int):
     """
-    Actualiza el estado interno de los tokens (con las nuevas etiquetas del DataFrame)
-    y regenera la imagen resaltada.
     """
-    # Importación local para evitar ciclos
-    from ocr_processor import draw_boxes
-    new_tokens_data = []
-    if isinstance(df_labels, list) and tokens_data and len(df_labels) == len(tokens_data):
-        for i, item in enumerate(tokens_data):
-            new_tag = item['ner_tag']
-            try:
-                # La etiqueta está en el índice 1 de la fila del Dataframe
-                if len(df_labels[i]) > 1:
-                   new_tag = df_labels[i][1]
-            except Exception:
-                pass
-            new_tokens_data.append({
-                'token': item['token'],
-                'bbox_norm': item['bbox_norm'],
-                'bbox_orig': item['bbox_orig'],
-                'ner_tag': new_tag
-            })
     else:
-        new_tokens_data = tokens_data
-    # Generar la imagen resaltada
-    highlighted_image = draw_boxes(image_orig, new_tokens_data, highlight_index)
-    return new_tokens_data, highlighted_image
 # --- Función de Exportación ---
-def export_data(image_orig, tokens_data: list):
-    """Exporta los datos anotados a un archivo JSON estructurado."""
-    if not tokens_data:
-        ErrorHandler.show_error("Error: No hay datos de anotación para exportar.")
         return None, None
     try:
-        W, H = image_orig.size
-        output_data = {
-            'metadata': {
-                'image_size': [W, H],
-                'format': 'Structured JSON for LayoutXLM Fine-Tuning',
-                'note': 'Contains tokens, bboxes normalized to 0-1000, and NER tags.'
-            },
-            'annotations': []
-        }
-        for item in tokens_data:
-            output_data['annotations'].append({
-                'token': item['token'],
-                'bbox_normalized': item['bbox_norm'],
-                'ner_tag': item['ner_tag']
-            })
-        temp_file = "anotacion_factura.json"
         with open(temp_file, 'w', encoding='utf-8') as f:
-            json.dump(output_data, f, ensure_ascii=False, indent=4)
-        gr.Info("✅ Exportación exitosa. Descarga el archivo JSON.")
-        return temp_file, "Exportación a JSON completada con éxito."
     except Exception as e:
         error_msg = ErrorHandler.handle_export_error(e)
-        gr.Warning(f"Error al exportar: {error_msg}")
         return None, f"Error en exportación: {error_msg}"

 import gradio as gr
+import os
 import json
+import pandas as pd
 from error_handler import ErrorHandler
 from ner_tags import ALL_NER_TAGS
+# Definir la ruta base del dataset (debe coincidir con ocr_processor.py si es necesario)
+DATASET_BASE_DIR = "dataset"
+JSON_FILENAME = "anotacion_factura.json"
 # --- Funciones de Configuración y UI ---
 def setup_label_components():
     """Create the widgets used to review and edit token labels.

     Returns, in order: the read-only token/tag table, the textbox for editing
     the selected token, the dropdown for picking its NER tag, the export
     button and the file output for the generated JSON. Both editors are
     created hidden.
     """
     # The table is display/selection only; edits go through the widgets below.
     table_options = {
         "headers": ["token", "ner_tag"],
         "col_count": (2, "fixed"),
         "datatype": ["str", "str"],
         "label": "Tabla de Tokens y Etiquetas (Haga clic en la FILA para seleccionar y editar abajo)",
         "interactive": False,
         "wrap": True,
         "value": [],
     }
     token_table = gr.Dataframe(**table_options)

     # Editor for the text of the selected token.
     token_box = gr.Textbox(
         label="Token Seleccionado",
         interactive=True,
         visible=False,
     )

     # Editor for the NER tag of the selected token.
     tag_dropdown = gr.Dropdown(
         choices=ALL_NER_TAGS,
         label="Etiqueta NER Seleccionada",
         value="O",
         interactive=True,
         visible=False,
     )

     export_button = gr.Button("Exportar a JSON para Fine-Tuning", variant="secondary")
     json_download = gr.File(label="Archivo de Anotación JSON")

     return token_table, token_box, tag_dropdown, export_button, json_download
+# --- FUNCIÓN: Obtener la fila seleccionada y mostrar editores ---
def display_selected_row(tokens_data: list, highlight_index: int):
    """Load the selected row into the external token and tag editors.

    Args:
        tokens_data: Token records; each one is read through its ``'token'``
            and ``'ner_tag'`` keys.
        highlight_index: Zero-based index of the selected row.

    Returns:
        A 3-tuple ``(textbox_update, dropdown_update, index)``. For a valid
        selection both editors are shown with the row's current values and the
        index is passed through. Otherwise both editors are hidden and the
        index is ``-1``.
    """
    tokens = tokens_data or []

    # A missing or non-integer index is treated as "no selection" instead of
    # raising on the comparison.
    has_selection = isinstance(highlight_index, int) and 0 <= highlight_index < len(tokens)
    if not has_selection:
        return gr.update(value="", visible=False), gr.update(visible=False), -1

    selected = tokens[highlight_index]
    return (
        gr.update(value=selected['token'], visible=True),    # tb_token_editor
        gr.update(value=selected['ner_tag'], visible=True),  # dd_tag_selector
        highlight_index,
    )
+# --- FUNCIÓN: Actualizar el Dataframe y el estado de los tokens (Mantener) ---
def update_dataframe_and_state(tokens_data: list, df_data_current: list, new_tag: str, new_token: str, row_index: int, update_type: str):
    """Apply one edit (tag or token text) to both the token list and the table rows.

    Args:
        tokens_data: Token records, updated in place through their
            ``'token'`` / ``'ner_tag'`` keys.
        df_data_current: Current table contents, either a pandas DataFrame or
            a list of ``[token, ner_tag]`` rows.
        new_tag: Tag to store when ``update_type`` is ``'tag'``.
        new_token: Text to store when ``update_type`` is ``'token'``.
        row_index: Zero-based index of the row being edited.
        update_type: ``'tag'`` or ``'token'``; any other value changes nothing.

    Returns:
        ``(tokens_data, rows)`` where ``rows`` is the table as a list of lists.
        Both are returned unchanged when ``row_index`` does not address a row
        that exists in the table *and* in ``tokens_data``.
    """
    if isinstance(df_data_current, pd.DataFrame):
        df_list = df_data_current.values.tolist()
    elif df_data_current is None:
        df_list = []
    else:
        df_list = df_data_current

    # Only rows present in both structures are editable; checking the table
    # alone would allow an IndexError on tokens_data when the two are out of sync.
    editable_rows = min(len(df_list), len(tokens_data or []))
    if not isinstance(row_index, int) or not 0 <= row_index < editable_rows:
        return tokens_data, df_list

    if update_type == 'tag':
        df_list[row_index][1] = new_tag
        tokens_data[row_index]['ner_tag'] = new_tag
    elif update_type == 'token':
        df_list[row_index][0] = new_token
        tokens_data[row_index]['token'] = new_token

    return tokens_data, df_list
+# --- FUNCIÓN: Actualizar la UI al cambiar la selección ---
def update_ui(image_orig, tokens_data: list, df_labels: list, highlight_index: int):
    """Redraw the invoice image with the box at ``highlight_index`` highlighted.

    ``tokens_data`` is passed back untouched as the first element of the
    result; ``df_labels`` is accepted but not read.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import with ocr_processor; confirm).
    from ocr_processor import draw_boxes

    return tokens_data, draw_boxes(image_orig, tokens_data, highlight_index)
 # --- Función de Exportación ---
def _load_existing_documents(json_path: str) -> list:
    """Return the documents already stored in *json_path* as a list.

    A missing, unreadable or corrupt file yields an empty list (with a UI
    warning for the latter two). A file in the earlier single-document layout
    ``{'metadata': {...}, 'annotations': [...]}`` is converted into a
    one-element list so its annotations are kept instead of being overwritten.
    """
    if not os.path.exists(json_path):
        return []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        gr.Warning(f"Advertencia: El archivo {json_path} existe pero está corrupto/vacío. Se creará uno nuevo.")
        return []
    except Exception as e:
        ErrorHandler.handle_export_error(e)
        gr.Warning("Error al leer el archivo existente. Se agregará solo este documento.")
        return []

    if isinstance(data, list):
        return data

    if isinstance(data, dict) and 'annotations' in data:
        # Earlier layout: one document per file, image size stored under metadata.
        metadata = data.get('metadata')
        size = metadata.get('image_size') if isinstance(metadata, dict) else None
        return [{
            'image': {'size': size, 'name': None},
            'annotations': data['annotations'],
        }]

    return []


def export_data(image_orig, tokens_data: list, image_filename: str):
    """Append the current document's annotations to the dataset JSON file.

    The file holds a JSON array with one object per document:
    ``{'image': {'size': [W, H], 'name': ...}, 'annotations': [...]}``.

    Args:
        image_orig: Image whose ``size`` attribute gives ``(width, height)``.
        tokens_data: Token records read through ``'token'``, ``'bbox_norm'``
            and ``'ner_tag'``.
        image_filename: File name stored with the document entry.

    Returns:
        ``(json_path, status_message)`` on success, ``(None, None)`` when
        there is nothing to export, or ``(None, error_message)`` when the
        file cannot be written.
    """
    # Without tokens, a file name and the image itself there is nothing valid to write.
    if not tokens_data or not image_filename or image_orig is None:
        ErrorHandler.show_error("Error: No hay datos de anotación o nombre de archivo para exportar.")
        return None, None

    temp_file = os.path.join(DATASET_BASE_DIR, JSON_FILENAME)

    new_annotations = [
        {
            'token': item['token'],
            # Coordinates are written as plain ints.
            'bbox_normalized': [int(b) for b in item['bbox_norm']],
            'ner_tag': item['ner_tag'],
        }
        for item in tokens_data
    ]

    W, H = image_orig.size
    new_document_entry = {
        'image': {
            'size': [W, H],
            'name': image_filename,
        },
        'annotations': new_annotations,
    }

    existing_document_list = _load_existing_documents(temp_file)
    existing_document_list.append(new_document_entry)

    # Entries that are not objects, or lack an annotation list, count as zero
    # instead of aborting the export.
    total_annotations_count = sum(
        len(doc.get('annotations') or [])
        for doc in existing_document_list
        if isinstance(doc, dict)
    )

    try:
        # Directory creation sits inside the try so a filesystem error is
        # reported the same way as a failed write.
        os.makedirs(DATASET_BASE_DIR, exist_ok=True)
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(existing_document_list, f, ensure_ascii=False, indent=4)
    except Exception as e:
        error_msg = ErrorHandler.handle_export_error(e)
        gr.Warning(f"Error al escribir el archivo: {error_msg}")
        return None, f"Error en exportación: {error_msg}"

    gr.Info(f"✅ Exportación exitosa. Documento '{image_filename}' agregado. Total de documentos: {len(existing_document_list)}. Tokens totales: {total_annotations_count}")
    return temp_file, f"Exportación JSON completada. Documentos: {len(existing_document_list)}, Tokens: {total_annotations_count}"

ocr_processor.py CHANGED Viewed

@@ -1,14 +1,19 @@
 import pytesseract
 from PIL import Image, ImageDraw
 import os
-import cv2 # Necesitas instalar: pip install opencv-python
-import numpy as np # Necesitas instalar: pip install numpy
-import pandas as pd # Necesitas instalar: pip install pandas
 # Asumiendo que ErrorHandler es una clase o módulo que manejas aparte
 # Si no lo tienes, deberás crear una implementación simple o manejar las excepciones directamente.
 from error_handler import ErrorHandler
 # Configuración de Tesseract
 TESSERACT_CONFIG = r'--oem 3 --psm 3'
 # --oem 3: Usar motor LSTM (moderno)
@@ -147,24 +152,49 @@ def draw_boxes(image: Image.Image, tokens_data: list, highlight_index: int = -1)
     return img_copy
 def process_and_setup(image_file):
-    """Función inicial: OCR y configuración del estado para la UI."""
     if image_file is None:
         empty_df = {'token': [], 'ner_tag': []}
         ErrorHandler.show_error("Cargue una imagen para comenzar.")
-        return None, [], None, empty_df, None
     image_orig, tokens_data, _ = get_ocr_data(image_file)
     if image_orig is None:
         empty_df = {'token': [], 'ner_tag': []}
-        return None, [], None, empty_df, "Error al procesar el OCR. Revise el log."
     if not tokens_data:
         empty_df = {'token': [], 'ner_tag': []}
         msg = "OCR completado. No se detectaron tokens válidos."
         ErrorHandler.show_error(msg)
-        return image_orig, [], None, empty_df, msg
     # Crear el DataFrame inicial para la edición en Gradio
     df_data = {
@@ -176,4 +206,6 @@ def process_and_setup(image_file):
     highlighted_image = image_orig.copy()
     msg = f"OCR completado. Tokens detectados: {len(tokens_data)}"
     print(msg)
-    return image_orig, tokens_data, highlighted_image, df_data, msg

 import pytesseract
 from PIL import Image, ImageDraw
 import os
+import cv2
+import numpy as np
+import pandas as pd
+import uuid
 # Asumiendo que ErrorHandler es una clase o módulo que manejas aparte
 # Si no lo tienes, deberás crear una implementación simple o manejar las excepciones directamente.
 from error_handler import ErrorHandler
+DATASET_BASE_DIR = "dataset"
+IMAGES_DIR = os.path.join(DATASET_BASE_DIR, "imagenes")
 # Configuración de Tesseract
 TESSERACT_CONFIG = r'--oem 3 --psm 3'
 # --oem 3: Usar motor LSTM (moderno)
     return img_copy
def save_image_to_dataset(image: Image.Image) -> str:
    """Save *image* as a JPEG under ``IMAGES_DIR`` with a UUID-based name.

    Args:
        image: Image to store. Modes that the JPEG writer cannot encode
            (for example RGBA or palette images coming from PNG uploads)
            are converted to RGB first.

    Returns:
        The generated file name (not the full path).
    """
    unique_filename = f"{uuid.uuid4()}.jpeg"
    save_path = os.path.join(IMAGES_DIR, unique_filename)

    os.makedirs(IMAGES_DIR, exist_ok=True)

    # Saving an image with an alpha channel or a palette as JPEG raises an
    # error, so normalise the mode before writing.
    if image.mode not in ("RGB", "L", "CMYK"):
        image = image.convert("RGB")

    image.save(save_path, format="JPEG")
    print(f"Imagen guardada: {save_path}")
    return unique_filename
 def process_and_setup(image_file):
+    """
+    Función inicial: OCR, configuración del estado y ahora, guardar la imagen.
+    """
     if image_file is None:
         empty_df = {'token': [], 'ner_tag': []}
         ErrorHandler.show_error("Cargue una imagen para comenzar.")
+        return None, [], None, empty_df, None, None # <-- Nuevo retorno: image_filename
     image_orig, tokens_data, _ = get_ocr_data(image_file)
     if image_orig is None:
         empty_df = {'token': [], 'ner_tag': []}
+        return None, [], None, empty_df, "Error al procesar el OCR. Revise el log.", None # <-- Nuevo retorno: image_filename
+    # --- CAMBIO CLAVE: Guardar la imagen y obtener su nombre único ---
+    image_filename = save_image_to_dataset(image_orig)
     if not tokens_data:
         empty_df = {'token': [], 'ner_tag': []}
         msg = "OCR completado. No se detectaron tokens válidos."
         ErrorHandler.show_error(msg)
+        return image_orig, [], None, empty_df, msg, image_filename # <-- Devolver nombre
     # Crear el DataFrame inicial para la edición en Gradio
     df_data = {
     highlighted_image = image_orig.copy()
     msg = f"OCR completado. Tokens detectados: {len(tokens_data)}"
     print(msg)
+    # Devolver el nombre único de la imagen junto con los estados
+    return image_orig, tokens_data, highlighted_image, df_data, msg, image_filename