Spaces:

lucasgagneten
/

OCR-NER-Facturas

Sleeping

App Files Files Community

Lucas Gagneten committed on Oct 13

Commit

cd12bfc

1 Parent(s): fe821af

dataset.zip

Browse files

Files changed (2) hide show

app.py +57 -40
label_editor.py +124 -76

app.py CHANGED Viewed

@@ -1,12 +1,20 @@
 import gradio as gr
-from image_loader import setup_image_components # Asumo que este módulo existe
 from ocr_processor import setup_tesseract, process_and_setup
-from label_editor import setup_label_components, update_ui, export_data, update_dataframe_and_state, display_selected_row, ALL_NER_TAGS
 # Configurar Tesseract al inicio
 setup_tesseract()
-# --- Función de Limpieza (Actualizada) ---
 def clear_ui_and_reset_states():
     """Limpia los componentes de la interfaz y resetea los estados."""
@@ -16,7 +24,7 @@ def clear_ui_and_reset_states():
     reset_image_orig_state = None
     reset_tokens_data_state = []
     reset_highlight_index_state = -1
-    reset_image_filename_state = None # Estado del nombre de archivo único
     # Actualizaciones para los componentes de la interfaz
     image_input_update = gr.update(value=None, visible=True)
@@ -46,20 +54,17 @@ def clear_ui_and_reset_states():
 # --- FUNCIONES AUXILIARES DE FLUJO ---
 def process_image(image):
-    """Ejecuta el OCR y el preprocesamiento inicial."""
     if image is None:
-        # Añadir None para image_filename_state en el retorno de error
         return None, [], None, [], "Sube una imagen para comenzar...", gr.update(visible=True), gr.update(visible=False), None
     try:
-        # process_and_setup ahora retorna: image_orig, tokens_data, highlighted_image, df_data, status, image_filename
         result = process_and_setup(image)
         if result[0] is None:
-            # Añadir None para image_filename_state en el retorno de error
             return None, [], None, [], "Error en el procesamiento del OCR. Verifica logs.", gr.update(visible=True), gr.update(visible=False, value=None), None
-        # Desempaquetar el resultado
         image_orig, tokens_data, highlighted_image, df_data, status, image_filename = result
         # Convertir datos para el DataFrame de Gradio (lista de listas)
@@ -81,7 +86,6 @@ def process_image(image):
     except Exception as e:
         print(f"Error en process_image: {str(e)}")
-        # Asegurar que la función siempre retorne el número correcto de outputs
         return None, [], None, [], f"Error: {str(e)}", gr.update(visible=True), gr.update(visible=False, value=None), None
 def capture_highlight_index(evt: gr.SelectData):
@@ -96,21 +100,20 @@ def capture_highlight_index(evt: gr.SelectData):
 with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
     gr.Markdown(
         """
-        # 🧾 Anotador NER para Facturas Argentinas (LayoutXLM)
-        **Instrucciones:**
-        1. **Sube** una imagen. La imagen se guarda automáticamente en `dataset/imagenes`.
-        2. **Haz clic en una FILA** de la tabla para **seleccionar** el token y mostrar los editores.
-        3. El cambio de la **Etiqueta NER** (Dropdown) y la edición del **Token** (Textbox) se aplican **automáticamente**.
-        4. Al exportar, la información se **agrega** a `anotacion_factura.json`.
         """
     )
     # Componentes de estado
     image_orig_state = gr.State(None)
-    tokens_data_state = gr.State([]) # Estado de la verdad que incluye tokens y bboxes
-    highlight_index_state = gr.State(-1) # Índice de la fila seleccionada
-    image_filename_state = gr.State(None) # Nombre de archivo único para la exportación
     with gr.Row():
         with gr.Column(scale=1):
@@ -139,83 +142,97 @@ with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
             # Columna Derecha: Edición de Etiquetas
             gr.Markdown("### 2. Edición de Etiquetas NER")
-            # CAPTURAR COMPONENTES (df_label_input, tb_token_editor, dd_tag_selector, btn_export, file_output)
-            df_label_input, tb_token_editor, dd_tag_selector, btn_export, file_output = setup_label_components()
-            # Contenedor para los editores (Token y Tag en dos columnas contiguas)
             with gr.Row(visible=True) as editor_row:
                 with gr.Column(scale=2):
-                    tb_token_editor # Textbox a la izquierda (más ancho)
                 with gr.Column(scale=1):
-                    dd_tag_selector # Dropdown a la derecha (más estrecho)
     # --- CONEXIONES DE EVENTOS ---
-    # CONEXIÓN 1: EJECUTAR OCR AUTOMÁTICAMENTE AL CARGAR IMAGEN
     image_input_file.change(
         fn=process_image,
         inputs=[image_input_file],
         outputs=[
             image_orig_state, tokens_data_state, image_output_display, df_label_input, status_output,
-            image_input_file, image_output_display, image_filename_state # Nuevo estado
         ],
         api_name=False
     )
-    # CONEXIÓN 2: Al hacer clic en una FILA (Selección/Resaltado)
     df_label_input.select(
         fn=capture_highlight_index,
         inputs=None,
         outputs=[highlight_index_state],
         queue=False
     ).then(
-        # Paso A: Mostrar el token y la etiqueta en los editores externos
         fn=display_selected_row,
         inputs=[tokens_data_state, highlight_index_state],
         outputs=[tb_token_editor, dd_tag_selector, highlight_index_state],
     ).then(
-        # Paso B: Resaltar la fila en la imagen
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
-    # CONEXIÓN 3.1: Dropdown cambia la etiqueta NER (Actualización Automática con .change())
     dd_tag_selector.change(
-        # Capturar el valor actual del Dropdown para pasarlo a la función de actualización
         fn=lambda t, d, i, new_tag_val: update_dataframe_and_state(t, d, new_tag_val, None, i, 'tag'),
         inputs=[tokens_data_state, df_label_input, highlight_index_state, dd_tag_selector],
         outputs=[tokens_data_state, df_label_input],
     ).then(
-        # Refrescar la imagen con el resaltado actualizado
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
-    # CONEXIÓN 3.2: Textbox cambia el Token (Actualización Automática con .blur y .submit)
     token_update_events = [tb_token_editor.blur, tb_token_editor.submit]
     for event in token_update_events:
         event(
-            # Capturar el valor actual del Textbox
             fn=lambda t, d, i, new_token_val: update_dataframe_and_state(t, d, None, new_token_val, i, 'token'),
             inputs=[tokens_data_state, df_label_input, highlight_index_state, tb_token_editor],
             outputs=[tokens_data_state, df_label_input],
         ).then(
-            # Refrescar la imagen
             fn=update_ui,
             inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
             outputs=[tokens_data_state, image_output_display],
             api_name=False
         )
-    # CONEXIÓN 4: Exportar (Añadiendo image_filename_state)
     btn_export.click(
-        fn=export_data,
         inputs=[image_orig_state, tokens_data_state, image_filename_state],
         outputs=[file_output, status_output],
         api_name=False
@@ -229,7 +246,7 @@ with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
             image_orig_state,
             tokens_data_state,
             highlight_index_state,
-            image_filename_state, # Nuevo output de limpieza
             image_input_file,
             image_output_display,
             df_label_input,

 import gradio as gr
+from image_loader import setup_image_components # Asume que tienes este módulo
 from ocr_processor import setup_tesseract, process_and_setup
+from label_editor import (
+    setup_label_components,
+    update_ui,
+    save_current_annotation_to_json, # Nueva función de guardado
+    export_and_zip_dataset,        # Nueva función de exportación
+    update_dataframe_and_state,
+    display_selected_row,
+    ALL_NER_TAGS
+)
 # Configurar Tesseract al inicio
 setup_tesseract()
+# --- Función de Limpieza ---
 def clear_ui_and_reset_states():
     """Limpia los componentes de la interfaz y resetea los estados."""
     reset_image_orig_state = None
     reset_tokens_data_state = []
     reset_highlight_index_state = -1
+    reset_image_filename_state = None
     # Actualizaciones para los componentes de la interfaz
     image_input_update = gr.update(value=None, visible=True)
 # --- FUNCIONES AUXILIARES DE FLUJO ---
 def process_image(image):
+    """Ejecuta el OCR y el preprocesamiento inicial, guardando la imagen."""
     if image is None:
         return None, [], None, [], "Sube una imagen para comenzar...", gr.update(visible=True), gr.update(visible=False), None
     try:
+        # process_and_setup retorna: image_orig, tokens_data, highlighted_image, df_data, status, image_filename
         result = process_and_setup(image)
         if result[0] is None:
             return None, [], None, [], "Error en el procesamiento del OCR. Verifica logs.", gr.update(visible=True), gr.update(visible=False, value=None), None
         image_orig, tokens_data, highlighted_image, df_data, status, image_filename = result
         # Convertir datos para el DataFrame de Gradio (lista de listas)
     except Exception as e:
         print(f"Error en process_image: {str(e)}")
         return None, [], None, [], f"Error: {str(e)}", gr.update(visible=True), gr.update(visible=False, value=None), None
 def capture_highlight_index(evt: gr.SelectData):
 with gr.Blocks(title="Anotador NER de Facturas (LayoutXLM)") as app:
     gr.Markdown(
         """
+        # 🧾 Anotador NER para Facturas (LayoutXLM)
+        **Instrucciones:** 1. **Sube** una imagen. La imagen se guarda automáticamente en `dataset/imagenes`.
+        2. **Edita** los tokens o etiquetas. Los cambios se aplican automáticamente.
+        3. Haz clic en **'Guardar Anotación Actual (JSON)'** para confirmar los datos de la factura actual en `dataset/anotacion_factura.json`.
+        4. Haz clic en **'Descargar Dataset Completo (.zip)'** para obtener todas las imágenes y el JSON consolidado.
         """
     )
     # Componentes de estado
     image_orig_state = gr.State(None)
+    tokens_data_state = gr.State([])
+    highlight_index_state = gr.State(-1)
+    image_filename_state = gr.State(None) # Nombre de archivo único
     with gr.Row():
         with gr.Column(scale=1):
             # Columna Derecha: Edición de Etiquetas
             gr.Markdown("### 2. Edición de Etiquetas NER")
+            # CAPTURAR EL NUEVO BOTÓN: btn_save_annotation
+            df_label_input, tb_token_editor, dd_tag_selector, btn_save_annotation, btn_export, file_output = setup_label_components()
+            # Dataframe
+            df_label_input
+            # Contenedor para los editores (Token y Tag)
             with gr.Row(visible=True) as editor_row:
                 with gr.Column(scale=2):
+                    tb_token_editor
                 with gr.Column(scale=1):
+                    dd_tag_selector
+            # Contenedor para los botones de Guardar/Descargar
+            with gr.Row(visible=True):
+                with gr.Column(scale=1):
+                    btn_save_annotation # NUEVO BOTÓN: Guardar JSON
+                with gr.Column(scale=1):
+                    btn_export # Botón: Descargar ZIP
+            file_output
     # --- CONEXIONES DE EVENTOS ---
+    # CONEXIÓN 1: EJECUTAR OCR
     image_input_file.change(
         fn=process_image,
         inputs=[image_input_file],
         outputs=[
             image_orig_state, tokens_data_state, image_output_display, df_label_input, status_output,
+            image_input_file, image_output_display, image_filename_state
         ],
         api_name=False
     )
+    # CONEXIÓN 2: Selección de FILA
     df_label_input.select(
         fn=capture_highlight_index,
         inputs=None,
         outputs=[highlight_index_state],
         queue=False
     ).then(
         fn=display_selected_row,
         inputs=[tokens_data_state, highlight_index_state],
         outputs=[tb_token_editor, dd_tag_selector, highlight_index_state],
     ).then(
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
+    # CONEXIÓN 3.1: Dropdown cambia la etiqueta NER (Actualización Automática)
     dd_tag_selector.change(
         fn=lambda t, d, i, new_tag_val: update_dataframe_and_state(t, d, new_tag_val, None, i, 'tag'),
         inputs=[tokens_data_state, df_label_input, highlight_index_state, dd_tag_selector],
         outputs=[tokens_data_state, df_label_input],
     ).then(
         fn=update_ui,
         inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
         outputs=[tokens_data_state, image_output_display],
         api_name=False
     )
+    # CONEXIÓN 3.2: Textbox cambia el Token (Actualización Automática)
     token_update_events = [tb_token_editor.blur, tb_token_editor.submit]
     for event in token_update_events:
         event(
             fn=lambda t, d, i, new_token_val: update_dataframe_and_state(t, d, None, new_token_val, i, 'token'),
             inputs=[tokens_data_state, df_label_input, highlight_index_state, tb_token_editor],
             outputs=[tokens_data_state, df_label_input],
         ).then(
             fn=update_ui,
             inputs=[image_orig_state, tokens_data_state, df_label_input, highlight_index_state],
             outputs=[tokens_data_state, image_output_display],
             api_name=False
         )
+    # CONEXIÓN 3.3: Guardar Anotación Actual (JSON)
+    btn_save_annotation.click(
+        fn=save_current_annotation_to_json,
+        inputs=[image_orig_state, tokens_data_state, image_filename_state],
+        outputs=[file_output, status_output],
+        api_name=False
+    )
+    # CONEXIÓN 4: Exportar y Comprimir (ZIP)
     btn_export.click(
+        fn=export_and_zip_dataset,
         inputs=[image_orig_state, tokens_data_state, image_filename_state],
         outputs=[file_output, status_output],
         api_name=False
             image_orig_state,
             tokens_data_state,
             highlight_index_state,
+            image_filename_state,
             image_input_file,
             image_output_display,
             df_label_input,

label_editor.py CHANGED Viewed

@@ -1,21 +1,24 @@
 import gradio as gr
-import os
 import json
 import pandas as pd
-from error_handler import ErrorHandler
-from ner_tags import ALL_NER_TAGS
-# Definir la ruta base del dataset (debe coincidir con ocr_processor.py si es necesario)
 DATASET_BASE_DIR = "dataset"
 JSON_FILENAME = "anotacion_factura.json"
 # --- Funciones de Configuración y UI ---
 def setup_label_components():
     """
-    Configura y retorna los componentes de edición de etiquetas:
-    DataFrame (no interactivo), Textbox para Token, Dropdown para Tag,
-    botón de exportar y salida de archivo.
     """
     # 1. Dataframe NO INTERACTIVO (Solo para visualización y selección de fila)
@@ -26,36 +29,35 @@ def setup_label_components():
         label="Tabla de Tokens y Etiquetas (Haga clic en la FILA para seleccionar y editar abajo)",
         interactive=False, # Deshabilitar la edición directa
         wrap=True,
-        value=[]
     )
-    # 2. NUEVOS COMPONENTES DE EDICIÓN EXTERNOS
-    # Input para editar el token
     tb_token_editor = gr.Textbox(
-        # Eliminamos la instrucción "presionar ENTER" para reflejar el cambio automático
         label="Token Seleccionado",
         interactive=True,
-        visible=False # Oculto por defecto
     )
-    # Dropdown para editar la etiqueta NER
     dd_tag_selector = gr.Dropdown(
         choices=ALL_NER_TAGS,
         label="Etiqueta NER Seleccionada",
         value="O",
         interactive=True,
-        visible=False  # Oculto por defecto
     )
-    # Eliminamos btn_apply_token
-    # Resto de componentes
-    btn_export = gr.Button("Exportar a JSON para Fine-Tuning", variant="secondary")
-    file_output = gr.File(label="Archivo de Anotación JSON")
-    # Retornamos los nuevos componentes (sin btn_apply_token)
-    return df_label_input, tb_token_editor, dd_tag_selector, btn_export, file_output
 # --- FUNCIÓN: Obtener la fila seleccionada y mostrar editores ---
@@ -71,7 +73,6 @@ def display_selected_row(tokens_data: list, highlight_index: int):
         # Muestra los componentes
         visible_update = gr.update(visible=True)
-        # DEVOLVER LA ETIQUETA ACTUAL del token para inicializar el Dropdown correctamente
         return (
             gr.update(value=token, visible=True),        # tb_token_editor
             gr.update(value=ner_tag, visible=True),      # dd_tag_selector
@@ -83,14 +84,14 @@ def display_selected_row(tokens_data: list, highlight_index: int):
     return gr.update(value="", visible=False), hidden_update, -1
-# --- FUNCIÓN: Actualizar el Dataframe y el estado de los tokens (Mantener) ---
-def update_dataframe_and_state(tokens_data: list, df_data_current: list, new_tag: str, new_token: str, row_index: int, update_type: str):
     """
-    Función unificada para actualizar la lista de tokens y el Dataframe.
-    (La lógica de esta función se mantiene sin cambios)
     """
     if isinstance(df_data_current, pd.DataFrame):
         df_list = df_data_current.values.tolist()
     else:
@@ -108,32 +109,37 @@ def update_dataframe_and_state(tokens_data: list, df_data_current: list, new_tag
     return tokens_data, df_list
-# --- FUNCIÓN: Actualizar la UI al cambiar la selección ---
 def update_ui(image_orig, tokens_data: list, df_labels: list, highlight_index: int):
-    from ocr_processor import draw_boxes
     highlighted_image = draw_boxes(image_orig, tokens_data, highlight_index)
     return tokens_data, highlighted_image
-# --- Función de Exportación ---
-def export_data(image_orig, tokens_data: list, image_filename: str): # <-- ACEPTAR image_filename
     """
-    Exporta los datos anotados a un archivo JSON estructurado.
-    Si el archivo existe, agrega las nuevas anotaciones al final.
     """
-    if not tokens_data or not image_filename: # Validar que el nombre del archivo exista
-        ErrorHandler.show_error("Error: No hay datos de anotación o nombre de archivo para exportar.")
-        return None, None
     # 1. Asegurarse de que la carpeta 'dataset' exista
     os.makedirs(DATASET_BASE_DIR, exist_ok=True)
-    # 2. Definir la ruta completa del archivo JSON
-    temp_file = os.path.join(DATASET_BASE_DIR, JSON_FILENAME)
     new_annotations = []
-    # 1. Preparar las nuevas anotaciones en el formato requerido
     for item in tokens_data:
         new_annotations.append({
             'token': item['token'],
@@ -141,63 +147,105 @@ def export_data(image_orig, tokens_data: list, image_filename: str): # <-- ACEPT
             'ner_tag': item['ner_tag']
         })
-    # 2. Preparar el nuevo elemento a agregar al array de anotaciones (el objeto completo de la factura)
-    W, H = image_orig.size
     new_document_entry = {
         'image': {
             'size': [W, H],
-            'name': image_filename # <-- USAR EL NOMBRE ÚNICO
         },
         'annotations': new_annotations
     }
-    # 3. Leer el archivo existente y obtener los datos previos
-    # El archivo JSON principal será un ARRAY de objetos de documentos.
     existing_document_list = []
-    total_annotations_count = 0
     try:
         if os.path.exists(temp_file):
             with open(temp_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-                # ASUMIMOS que el archivo JSON es una lista [] de documentos
                 if isinstance(data, list):
                     existing_document_list = data
-                # Si el archivo está en el formato antiguo {metadata: {}, annotations: []} (solo 1 doc)
-                elif isinstance(data, dict) and 'annotations' in data:
-                    # Lo convertimos a la nueva estructura de lista si solo tiene un documento
-                    # Nota: Esto es peligroso, idealmente el formato de exportación debe ser consistente.
-                    # Asumiremos la nueva estructura JSON será: [ {doc1}, {doc2}, ... ]
-                    pass
-    except json.JSONDecodeError:
-        gr.Warning(f"Advertencia: El archivo {temp_file} existe pero está corrupto/vacío. Se creará uno nuevo.")
     except Exception as e:
-        ErrorHandler.handle_export_error(e)
-        gr.Warning(f"Error al leer el archivo existente. Se agregará solo este documento.")
-    # 4. Consolidar los datos y contar
-    # 4.1. Agregar el nuevo documento a la lista
-    existing_document_list.append(new_document_entry)
-    # 4.2. Contar el total de tokens en todas las facturas
-    for doc in existing_document_list:
-        total_annotations_count += len(doc.get('annotations', []))
-    # 5. Escribir la lista completa de documentos de vuelta al archivo
     try:
-        with open(temp_file, 'w', encoding='utf-8') as f:
-            # Escribir la lista de documentos directamente
-            json.dump(existing_document_list, f, ensure_ascii=False, indent=4)
-        gr.Info(f"✅ Exportación exitosa. Documento '{image_filename}' agregado. Total de documentos: {len(existing_document_list)}. Tokens totales: {total_annotations_count}")
-        return temp_file, f"Exportación JSON completada. Documentos: {len(existing_document_list)}, Tokens: {total_annotations_count}"
     except Exception as e:
         error_msg = ErrorHandler.handle_export_error(e)
-        gr.Warning(f"Error al escribir el archivo: {error_msg}")
-        return None, f"Error en exportación: {error_msg}"

 import gradio as gr
 import json
 import pandas as pd
+import os
+import zipfile
+from error_handler import ErrorHandler # Asume que tienes este módulo
+from ner_tags import ALL_NER_TAGS # Asume que tienes este módulo
+from ocr_processor import draw_boxes # Importación forzada para evitar errores de referencia
+# --- Configuración de Directorios ---
 DATASET_BASE_DIR = "dataset"
 JSON_FILENAME = "anotacion_factura.json"
+TEMP_ZIP_FILENAME = "dataset.zip"
 # --- Funciones de Configuración y UI ---
 def setup_label_components():
     """
+    Configura y retorna los componentes de edición de etiquetas, incluyendo
+    el nuevo botón 'Guardar Anotación'.
     """
     # 1. Dataframe NO INTERACTIVO (Solo para visualización y selección de fila)
         label="Tabla de Tokens y Etiquetas (Haga clic en la FILA para seleccionar y editar abajo)",
         interactive=False, # Deshabilitar la edición directa
         wrap=True,
+        value=[]
     )
+    # 2. Componentes de Edición Externos
     tb_token_editor = gr.Textbox(
         label="Token Seleccionado",
         interactive=True,
+        visible=False
     )
     dd_tag_selector = gr.Dropdown(
         choices=ALL_NER_TAGS,
         label="Etiqueta NER Seleccionada",
         value="O",
         interactive=True,
+        visible=False
     )
+    # 3. Botones y Salida
+    # NUEVO BOTÓN: Para guardar solo la factura actual en el JSON
+    btn_save_annotation = gr.Button("3. Guardar Anotación Actual (JSON)", variant="primary")
+    # Botón de Descargar ZIP (ahora es el paso 4)
+    btn_export = gr.Button("4. Descargar Dataset Completo (.zip)", variant="secondary")
+    file_output = gr.File(label="Archivo ZIP del Dataset (Imágenes + Anotaciones)")
+    # Retornar el nuevo componente
+    return df_label_input, tb_token_editor, dd_tag_selector, btn_save_annotation, btn_export, file_output
 # --- FUNCIÓN: Obtener la fila seleccionada y mostrar editores ---
         # Muestra los componentes
         visible_update = gr.update(visible=True)
         return (
             gr.update(value=token, visible=True),        # tb_token_editor
             gr.update(value=ner_tag, visible=True),      # dd_tag_selector
     return gr.update(value="", visible=False), hidden_update, -1
+# --- FUNCIÓN: Actualizar el Dataframe y el estado de los tokens ---
+def update_dataframe_and_state(tokens_data: list, df_data_current, new_tag: str, new_token: str, row_index: int, update_type: str):
     """
+    Función unificada para actualizar la lista de tokens (estado) y el Dataframe (UI).
     """
+    # Manejar el caso de entrada como Pandas DataFrame (por seguridad)
     if isinstance(df_data_current, pd.DataFrame):
         df_list = df_data_current.values.tolist()
     else:
     return tokens_data, df_list
# --- UI / state synchronisation ---
def update_ui(image_orig, tokens_data: list, df_labels: list, highlight_index: int):
    """Redraw the annotated image from the current token state.

    ``df_labels`` is part of the signature used by the event wiring but is not
    read here. ``tokens_data`` is handed back unchanged, paired with whatever
    ``draw_boxes`` returns for the given ``highlight_index``.
    """
    # The token state is already up to date at this point; only the image is
    # regenerated.
    return tokens_data, draw_boxes(image_orig, tokens_data, highlight_index)
+# --- FUNCIÓN: Guardar Anotación Actual (JSON) ---
+def save_current_annotation_to_json(image_orig, tokens_data: list, image_filename: str):
     """
+    Guarda la anotación del documento actual en el archivo JSON, sobrescribe si existe.
+    Retorna mensajes de estado a Gradio.
     """
+    if not tokens_data or not image_filename:
+        gr.Warning("Error: No hay tokens o la imagen no fue procesada.")
+        # Retorna el path (vacío) y el mensaje de estado (que se mostrará en status_output)
+        return None, "Guardado fallido: No hay datos de imagen o tokens."
     # 1. Asegurarse de que la carpeta 'dataset' exista
     os.makedirs(DATASET_BASE_DIR, exist_ok=True)
+    temp_file = os.path.join(DATASET_BASE_DIR, JSON_FILENAME)
+    # 2. Preparar el nuevo elemento a agregar
+    W, H = image_orig.size
     new_annotations = []
     for item in tokens_data:
         new_annotations.append({
             'token': item['token'],
             'ner_tag': item['ner_tag']
         })
     new_document_entry = {
         'image': {
             'size': [W, H],
+            'name': image_filename
         },
         'annotations': new_annotations
     }
+    # 3. Leer el archivo existente
     existing_document_list = []
     try:
         if os.path.exists(temp_file):
             with open(temp_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
                 if isinstance(data, list):
                     existing_document_list = data
+    except Exception:
+        # Si falla la lectura, comenzar con una lista vacía
+        pass
+    # 4. Consolidar: Agregar o Sobrescribir el documento actual
+    is_new = True
+    for i, doc in enumerate(existing_document_list):
+        if doc.get('image', {}).get('name') == image_filename:
+            # Documento ya existe (lo editamos), lo sobrescribimos con la versión editada
+            existing_document_list[i] = new_document_entry
+            is_new = False
+            break
+    if is_new:
+        # Es un documento nuevo, lo añadimos al final
+        existing_document_list.append(new_document_entry)
+    # 5. Escribir la lista completa
+    try:
+        with open(temp_file, 'w', encoding='utf-8') as f:
+            json.dump(existing_document_list, f, ensure_ascii=False, indent=4)
+        action_message = "actualizados" if not is_new else "agregados"
+        total_docs = len(existing_document_list)
+        msg = f"Anotación '{image_filename}' {action_message} al JSON. Documentos totales: {total_docs}."
+        gr.Info(f"✅ {msg}")
+        return None, msg
     except Exception as e:
+        error_msg = ErrorHandler.handle_export_error(e)
+        gr.Warning(f"Error al escribir el archivo: {error_msg}")
+        return None, f"Error en guardado: {error_msg}"
+# --- FUNCIÓN PRINCIPAL DE EXPORTACIÓN: ZIP ---
def export_and_zip_dataset(image_orig, tokens_data: list, image_filename: str):
    """
    Save the current annotation (if any) and pack the dataset folder into a ZIP.

    Steps:
      1. If a processed document is loaded, call
         ``save_current_annotation_to_json`` so the latest edits are on disk.
      2. Count the documents stored in the consolidated JSON file.
      3. Write every file found under ``DATASET_BASE_DIR`` (except the archive
         itself) into ``DATASET_BASE_DIR/TEMP_ZIP_FILENAME``.

    Args:
        image_orig: Original image of the current document; forwarded untouched
            to ``save_current_annotation_to_json``.
        tokens_data: Token records of the current document.
        image_filename: Unique file name of the current image.

    Returns:
        A ``(zip_path, status_message)`` tuple. ``zip_path`` is ``None`` when
        there is nothing to export or the archive could not be written.
    """
    # Step 1: flush the current document to the JSON file. The helper's return
    # value is intentionally ignored so its status text does not replace the
    # ZIP status. The call is skipped when nothing is loaded: in that case the
    # helper would only emit a "no tokens" warning, which is misleading for a
    # user who simply wants to download what has already been saved.
    if tokens_data and image_filename:
        save_current_annotation_to_json(image_orig, tokens_data, image_filename)

    # Step 2: count the saved documents for the status message.
    json_path = os.path.join(DATASET_BASE_DIR, JSON_FILENAME)
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # A missing, unreadable or malformed JSON file counts as "no documents".
        data = None
    total_docs = len(data) if isinstance(data, list) else 0

    if total_docs == 0:
        gr.Warning("Error: No hay datos guardados para generar el ZIP.")
        return None, "Error: No se puede generar el ZIP. El archivo JSON está vacío o no existe."

    # Step 3: build the archive.
    zip_path = os.path.join(DATASET_BASE_DIR, TEMP_ZIP_FILENAME)
    zip_abs_path = os.path.abspath(zip_path)

    try:
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _dirs, files in os.walk(DATASET_BASE_DIR):
                for filename in files:
                    file_path = os.path.join(root, filename)
                    # The archive lives inside the folder being walked, so it
                    # must not be added to itself. Comparing full paths (not
                    # just the base name) keeps unrelated nested files that
                    # happen to share the archive's name.
                    if os.path.abspath(file_path) == zip_abs_path:
                        continue
                    # Store entries relative to the dataset folder.
                    zipf.write(file_path, os.path.relpath(file_path, DATASET_BASE_DIR))

        gr.Info(f"✅ Dataset listo para descargar. Contiene {total_docs} documentos.")
        return zip_path, f"Dataset exportado con éxito a {TEMP_ZIP_FILENAME}. Descargue el archivo ZIP. (Total docs: {total_docs})"

    except Exception as e:
        error_msg = ErrorHandler.handle_export_error(e)
        gr.Warning(f"Error al comprimir el ZIP: {error_msg}")
        return None, f"Error al comprimir el dataset: {error_msg}"