vithacocf committed on
Commit 952d402 · verified · 1 Parent(s): 6649bd1

Update app.py


Add multi-carrier detection
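In outline, the change splits the table that Camelot/pdfplumber extracts into one block per carrier (rows whose first column mentions "CARRIER") and sends each block to Gemini separately. A minimal standalone sketch of that splitting step, using a hypothetical sample table and leaving out the Gemini call:

import pandas as pd

def split_carrier_blocks(df: pd.DataFrame) -> list[pd.DataFrame]:
    """Split an extracted rate table into one block per carrier.

    Rows whose first column contains "CARRIER" mark the start of a block,
    mirroring the detection added to run_process below. If no such row
    exists, the whole table is treated as a single block.
    """
    starts = df[df.iloc[:, 0].astype(str).str.contains("CARRIER", case=False, na=False)].index.tolist()
    if not starts:
        return [df]
    blocks = []
    for i, start in enumerate(starts):
        end = starts[i + 1] if i + 1 < len(starts) else len(df)
        blocks.append(df.iloc[start:end])
    return blocks

# Hypothetical example: two carrier sections stacked in one extracted table
sample = pd.DataFrame({
    0: ["CARRIER: VN", "SGN-NRT", "CARRIER: QR", "SGN-DOH"],
    1: ["", "2.10", "", "1.95"],
})
for block in split_carrier_blocks(sample):
    print(block.to_csv(index=False))  # each block would be sent to Gemini on its own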

Files changed (1)
  1. app.py +126 -189
app.py CHANGED
@@ -1,15 +1,18 @@
  from __future__ import annotations
- import os, io, re, json, time, mimetypes, tempfile
- from typing import List, Union, Tuple, Any
  from PIL import Image
  import pandas as pd
  import gradio as gr
  import google.generativeai as genai
- import requests
  import fitz # PyMuPDF
- import camelot
  import pdfplumber

  # ================== CONFIG ==================
  DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"

@@ -18,28 +21,27 @@ INTERNAL_MODEL_MAP = {
  "Gemini 2.5 Pro": "gemini-2.5-pro",
  }
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
  PROMPT_FREIGHT_JSON = """
- Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
  {
  "shipping_line": "...",
  "shipping_line_code": "...",
- "shipping_line_reason": "Why this carrier is chosen?",
  "fee_type": "Air Freight",
- "valid_from": ...,
- "valid_to": ...,
  "charges": [
  {
  "frequency": "...",
  "package_type": "...",
  "aircraft_type": "...",
- "direction": "Export or Import or null",
- "origin": "...",
- "destination": "...",
  "charge_name": "...",
- "charge_code": "GCR, DGR, PER, etc. (Use IATA Code DO NOT use flight number)",
- "charge_code_reason": "...",
- "cargo_type": "...",
  "currency": "...",
  "transit": "...",
  "transit_time": "...",
  "weight_breaks": {
@@ -50,50 +52,27 @@ Please analyze the freight rate table in the file I provide and convert it into
  "+300kg": ...,
  "+500kg": ...,
  "+1000kg": ...,
- "other": {
- key: value
- },
- "weight_breaks_reason": "Why chosen weight_breaks?"
  },
- "remark": "..."
- }
- ],
- "local_charges": [
- {
- "charge_name": "...",
- "charge_code": "...",
- "unit": "...",
- "amount": ...,
- "remark": "..."
  }
  ]
  }
- ### Date rules
- - valid_from format:
-   - `DD/MM/YYYY` (if full date)
-   - `01/MM/YYYY` (if month+year only)
-   - `01/01/YYYY` (if year only)
-   - `UFN` if missing
- - valid_to:
-   - exact `DD/MM/YYYY` if present
-   - else `UFN`
- STRICT RULES:
- - ONLY return a single JSON object as specified above.
- - All rates must exactly match the corresponding weight break columns (M, N, 45kg, 100kg, 300kg, 500kg, 1000kg, etc.). Set null if N/A. No assumptions or interpolations.
- - If the table shows "RQ" or similar, set value as "RQST".
  - Group same-price destinations into one record separated by "/".
- - Always use IATA code for origin and destination.
- - Flight number (e.g. ZH118) is not charge code.
- - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
- - If local charges exist, list them.
- - If validity missing, set null.
- - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
- - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
  - Replace commas in remarks with semicolons.
- - Only return JSON.
  """

-
  # ================== HELPERS ==================
  def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
      if upload is None:
@@ -115,61 +94,56 @@ def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
          mime = "application/pdf"
      return filename, mime or "application/octet-stream"

  def extract_pdf_tables(file_path: str) -> pd.DataFrame:
-     """
-     Extract the PDF's tables with Camelot (page by page):
-     - Try lattice first
-     - If that fails → fall back to stream
-     - Merge everything
-     """
-     import camelot
      all_dfs = []
-
-     # Count the total number of pages
-     import fitz
-     total_pages = len(fitz.open(file_path))
-     print(f"📄 Total pages: {total_pages}")
-
-     for page_no in range(1, total_pages + 1):
-         print(f"🔍 Processing page {page_no}...")
-         dfs_this_page = []
-
-         # --- Try lattice ---
-         try:
-             tables = camelot.read_pdf(
-                 file_path, flavor="lattice",
-                 pages=str(page_no), strip_text="\n", line_scale=40
-             )
-             if tables and tables.n > 0:
-                 for t in tables:
-                     if t.shape[0] > 0:
-                         dfs_this_page.append(t.df)
-                 print(f"✅ Page {page_no}: lattice succeeded ({tables.n} tables).")
-         except Exception as e:
-             print(f"⚠️ Page {page_no} lattice error: {e}")
-
-         # --- Fall back to stream ---
-         if not dfs_this_page:
              try:
-                 tables = camelot.read_pdf(
-                     file_path, flavor="stream",
-                     pages=str(page_no), edge_tol=200, row_tol=10
-                 )
                  if tables and tables.n > 0:
                      for t in tables:
                          if t.shape[0] > 0:
                              dfs_this_page.append(t.df)
-                 print(f"✅ Page {page_no}: stream succeeded ({tables.n} tables).")
              except Exception as e:
-                 print(f" Page {page_no} stream error: {e}")

-         if dfs_this_page:
-             all_dfs.extend(dfs_this_page)
-         else:
-             print(f"🚫 Page {page_no}: no tables detected.")

      if not all_dfs:
-         print(" No tables found anywhere in the PDF.")
          return pd.DataFrame()

      df_final = pd.concat(all_dfs, ignore_index=True)
@@ -180,160 +154,123 @@ def extract_pdf_tables(file_path: str) -> pd.DataFrame:
      print(f"✅ Combined: {len(df_final)} rows, {len(df_final.columns)} columns.")
      return df_final

  def extract_pdf_note(file_path: str) -> str:
-     """
-     Use pdfplumber to grab the text at the end of the document (notes, remarks...).
-     Only take the last ~10 lines of the last page.
-     """
      try:
          with pdfplumber.open(file_path) as pdf:
-             last_page = pdf.pages[-1]
-             text = (last_page.extract_text() or "").strip()
-             lines = text.splitlines()
-             note_text = "\n".join(lines[-12:])  # take the last ~12 lines
-             print(f"📝 Note text extracted successfully. {note_text}")
              return note_text
      except Exception as e:
          print(f"⚠️ extract_pdf_note error: {e}")
          return ""

  def call_gemini_with_prompt(content_text: str, note_text: str, question: str, model_choice: str, temperature: float, top_p: float):
-     """Send the table + notes to Gemini (prefer the custom prompt if one is given)."""
      api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
      genai.configure(api_key=api_key)
-
      model = genai.GenerativeModel(
          model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
-         generation_config={
-             "temperature": float(temperature),
-             "top_p": float(top_p)
-         }
      )
-
-     # If the user did not enter their own question, use the standard FREIGHT_JSON prompt
      base_prompt = question.strip() if question and question.strip() else PROMPT_FREIGHT_JSON
-
      prompt = f"""
- {base_prompt}
-
- Below is the extracted CSV data:
- {content_text}
-
- Below are the notes extracted from the PDF (e.g. Valid From, Origin, Remark, Package Type rules):
- {note_text}
-
- Please analyze all data and generate the JSON output following the schema above.
- """

-     print("🧠 Sending prompt to Gemini...")
-     response = model.generate_content(prompt)
-     result_text = getattr(response, "text", str(response))

-     return result_text

- # ================== MAIN ROUTER ==================
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
      try:
          if file is None:
              return "❌ No file uploaded.", None
-
          file_bytes = _read_file_bytes(file)
          filename, mime = _guess_name_and_mime(file, file_bytes)
          print(f"[UPLOAD] {filename} ({mime})")

          if mime == "application/pdf":
-             # Save a temporary file so Camelot can read it
              with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                  tmp.write(file_bytes)
                  tmp_path = tmp.name

-             # 1️⃣ Extract tables with Camelot
              df = extract_pdf_tables(tmp_path)
              note_text = extract_pdf_note(tmp_path)

              if not df.empty:
-                 csv_text = df.to_csv(index=False)
-                 print("✅ Sending to Gemini to generate JSON...")
-                 message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
-                 return message, None
              else:
                  print("⚠️ No valid tables, falling back to Gemini OCR.")
                  return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)

-         # Other file types → OCR directly
          return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)

      except Exception as e:
          return f"ERROR: {type(e).__name__}: {e}", None
- def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
-     api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
-     if not api_key:
-         return "ERROR: Missing GOOGLE_API_KEY.", None
-     genai.configure(api_key=api_key)
-     model_name = INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash")
-     model = genai.GenerativeModel(model_name=model_name,
-                                   generation_config={"temperature": float(temperature), "top_p": float(top_p)})
-
-     if file_bytes[:4] == b"%PDF":
-         pages = pdf_to_images(file_bytes)
-     else:
-         pages = [Image.open(io.BytesIO(file_bytes))]

-     user_prompt = (question or "").strip() or PROMPT_FREIGHT_JSON
-     all_json_results, all_text_results = [], []
-     previous_header_json = None
-
-     def _safe_text(resp):
-         try:
-             return resp.text
-         except:
-             return ""

      for i in range(0, len(pages), batch_size):
          batch = pages[i:i+batch_size]
-         uploaded = []
-         for im in batch:
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
-                 im.save(tmp.name)
-                 up = genai.upload_file(path=tmp.name, mime_type="image/png")
-                 up = genai.get_file(up.name)
-                 uploaded.append(up)
-
-         context_prompt = user_prompt
-         resp = model.generate_content([context_prompt] + uploaded)
-         text = _safe_text(resp)
-         all_text_results.append(text)
-         for up in uploaded:
-             try:
-                 genai.delete_file(up.name)
-             except:
-                 pass
-
      return "\n\n".join(all_text_results), None

  # ================== UI ==================
  def main():
-     with gr.Blocks(title="OCR Multi-Agent System") as demo:
          file = gr.File(label="Upload PDF/Image")
-         question = gr.Textbox(label="Prompt", lines=2)
          model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                     value="Gemini 2.5 Flash", label="Model")
          temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
          top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
-         external_api_url = gr.Textbox(label="External API URL", visible=False)
-         output_text = gr.Code(label="Output", language="json")
-         run_btn = gr.Button("🚀 Process")
-
-         run_btn.click(
-             run_process,
-             inputs=[file, question, model_choice, temperature, top_p, external_api_url],
-             outputs=[output_text, gr.State()]
-         )
-
      return demo

-
  demo = main()
-
  if __name__ == "__main__":
      demo.launch()

  from __future__ import annotations
+ import os, io, re, json, tempfile, mimetypes
+ from typing import Union, Tuple
  from PIL import Image
  import pandas as pd
  import gradio as gr
  import google.generativeai as genai
  import fitz # PyMuPDF
  import pdfplumber

+ try:
+     import camelot
+ except Exception:
+     camelot = None
+
  # ================== CONFIG ==================
  DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"

  "Gemini 2.5 Pro": "gemini-2.5-pro",
  }
  EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
+
  PROMPT_FREIGHT_JSON = """
+ Please analyze the freight rate table and convert it into JSON with this schema:
  {
  "shipping_line": "...",
  "shipping_line_code": "...",
  "fee_type": "Air Freight",
+ "valid_from": "...",
+ "valid_to": "...",
  "charges": [
  {
+ "origin": "...",
+ "destination": "...",
  "frequency": "...",
  "package_type": "...",
  "aircraft_type": "...",
+ "direction": "...",
  "charge_name": "...",
+ "charge_code": "GCR, DGR, PER, etc.",
  "currency": "...",
+ "cargo_type": "...",
  "transit": "...",
  "transit_time": "...",
  "weight_breaks": {
  "+300kg": ...,
  "+500kg": ...,
  "+1000kg": ...,
+ "other": { key: value }
  },
+ "remark": "...",
+ "pallet_rule": "...",
+ "additional_cost": "..."
  }
  ]
  }
+ ### RULES
+ - If remark says "SKID shipment: add 10 cents" → add surcharge line (+0.10 USD/kg) for Pallet (GEN & PER)
+ - Adjust all weight breaks (+0.1) keeping other keys the same.
+ - If remark says "Carton = Pallet" → same rates; no extra surcharge.
+ - If remark says "EU +USD0.30/kg and rest +USD0.20/kg" add 2 surcharge lines.
+ - Always record Carton rates as base; generate Pallet rates if mentioned.
  - Group same-price destinations into one record separated by "/".
+ - Frequency format: D[1-7]; "Daily" = D1234567.
+ - Direction = Export if origin is Vietnam, else Import.
  - Replace commas in remarks with semicolons.
+ - Only return valid JSON.
  """

  # ================== HELPERS ==================
  def _read_file_bytes(upload: Union[str, os.PathLike, dict, object] | None) -> bytes:
      if upload is None:

          mime = "application/pdf"
      return filename, mime or "application/octet-stream"

+ # ================== PDF TABLE EXTRACT ==================
  def extract_pdf_tables(file_path: str) -> pd.DataFrame:
+     """Use Camelot first; fall back to pdfplumber if it fails."""
      all_dfs = []
+     try:
+         total_pages = len(fitz.open(file_path))
+         print(f"📄 Total pages: {total_pages}")
+     except Exception:
+         total_pages = 1
+
+     if camelot is not None:
+         for page_no in range(1, total_pages + 1):
+             print(f"🔍 Processing page {page_no}...")
+             dfs_this_page = []
              try:
+                 tables = camelot.read_pdf(file_path, flavor="lattice", pages=str(page_no), line_scale=40)
                  if tables and tables.n > 0:
                      for t in tables:
                          if t.shape[0] > 0:
                              dfs_this_page.append(t.df)
+                     print(f"✅ Lattice OK ({tables.n} tables).")
              except Exception as e:
+                 print(f"⚠️ Lattice error: {e}")
+
+             if not dfs_this_page:
+                 try:
+                     tables = camelot.read_pdf(file_path, flavor="stream", pages=str(page_no), edge_tol=200)
+                     if tables and tables.n > 0:
+                         for t in tables:
+                             if t.shape[0] > 0:
+                                 dfs_this_page.append(t.df)
+                         print(f"✅ Stream OK ({tables.n} tables).")
+                 except Exception as e:
+                     print(f"❌ Stream error: {e}")
+
+             if dfs_this_page:
+                 all_dfs.extend(dfs_this_page)

+     if not all_dfs:
+         print("⚠️ Camelot found no tables → falling back to pdfplumber.")
+         with pdfplumber.open(file_path) as pdf:
+             for page in pdf.pages:
+                 tables = page.extract_tables()
+                 for tb in tables:
+                     if tb and len(tb) > 2:
+                         df = pd.DataFrame(tb[1:], columns=tb[0])
+                         all_dfs.append(df)

      if not all_dfs:
+         print(f"🚫 No tables detected in the PDF.")
          return pd.DataFrame()

      df_final = pd.concat(all_dfs, ignore_index=True)

      print(f"✅ Combined: {len(df_final)} rows, {len(df_final.columns)} columns.")
      return df_final

+ # ================== NOTE EXTRACTION ==================
158
  def extract_pdf_note(file_path: str) -> str:
 
 
 
 
159
  try:
160
  with pdfplumber.open(file_path) as pdf:
161
+ text = ""
162
+ for p in pdf.pages[-2:]: # lấy 2 trang cuối
163
+ t = (p.extract_text() or "")
164
+ text += "\n" + t
165
+ lines = text.strip().splitlines()
166
+ note_text = "\n".join(lines[-15:])
167
+ print(f"📝 Note Extracted: {len(note_text)} chars")
168
  return note_text
169
  except Exception as e:
170
  print(f"⚠️ extract_pdf_note lỗi: {e}")
171
  return ""
172
 
173
+ # ================== GEMINI CALL ==================
174
  def call_gemini_with_prompt(content_text: str, note_text: str, question: str, model_choice: str, temperature: float, top_p: float):
 
175
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
176
  genai.configure(api_key=api_key)
 
177
  model = genai.GenerativeModel(
178
  model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
179
+ generation_config={"temperature": float(temperature), "top_p": float(top_p)}
 
 
 
180
  )
 
 
181
  base_prompt = question.strip() if question and question.strip() else PROMPT_FREIGHT_JSON
 
182
  prompt = f"""
183
+ {base_prompt}
 
 
 
 
 
 
 
 
 
184
 
185
+ Below is the extracted CSV data:
186
+ {content_text}
 
187
 
188
+ Below are the notes (remark, package type, surcharges, etc.):
189
+ {note_text}
190
 
191
+ Please analyze everything and generate a valid JSON in the specified format.
192
+ """
193
+ print("🧠 Sending prompt to Gemini...")
194
+ resp = model.generate_content(prompt)
195
+ return getattr(resp, "text", str(resp))
196
 
+ # ================== MAIN PROCESS ==================
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
      try:
          if file is None:
              return "❌ No file uploaded.", None
          file_bytes = _read_file_bytes(file)
          filename, mime = _guess_name_and_mime(file, file_bytes)
          print(f"[UPLOAD] {filename} ({mime})")

          if mime == "application/pdf":
              with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                  tmp.write(file_bytes)
                  tmp_path = tmp.name

              df = extract_pdf_tables(tmp_path)
              note_text = extract_pdf_note(tmp_path)

              if not df.empty:
+                 # 🔹 If multiple carriers are detected, split the table into one block per carrier
+                 carrier_rows = df[df.iloc[:, 0].astype(str).str.contains("CARRIER", case=False, na=False)].index.tolist()
+                 results = []
+                 if carrier_rows:
+                     for i, start in enumerate(carrier_rows):
+                         end = carrier_rows[i + 1] if i + 1 < len(carrier_rows) else len(df)
+                         sub_df = df.iloc[start:end]
+                         csv_text = sub_df.to_csv(index=False)
+                         print(f"🚀 Processing carrier block {i+1}/{len(carrier_rows)}...")
+                         message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
+                         results.append(message)
+                     return "\n\n".join(results), None
+                 else:
+                     csv_text = df.to_csv(index=False)
+                     print("✅ Sending to Gemini to generate JSON...")
+                     message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
+                     return message, None
              else:
                  print("⚠️ No valid tables, falling back to Gemini OCR.")
                  return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)

+         # fallback if the file is not a PDF
          return run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p)

      except Exception as e:
          return f"ERROR: {type(e).__name__}: {e}", None

+ # ================== FALLBACK OCR ==================
+ def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
+     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+     images = []
+     for p in doc:
+         pix = p.get_pixmap(dpi=200)  # render each page once
+         images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
+     return images

+ def run_process_internal_base_v2(file_bytes, filename, mime, question, model_choice, temperature, top_p, batch_size=3):
+     genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY))
+     model = genai.GenerativeModel(INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
+                                   generation_config={"temperature": float(temperature), "top_p": float(top_p)})
+     pages = pdf_to_images(file_bytes) if file_bytes[:4] == b"%PDF" else [Image.open(io.BytesIO(file_bytes))]
+     all_text_results = []
      for i in range(0, len(pages), batch_size):
          batch = pages[i:i+batch_size]
+         uploads = []
+         for im in batch:
+             # save each page image to a temporary PNG so the upload has content
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
+                 im.save(tmp.name)
+             uploads.append(genai.upload_file(path=tmp.name, mime_type="image/png"))
+         resp = model.generate_content([question or PROMPT_FREIGHT_JSON] + uploads)
+         all_text_results.append(getattr(resp, "text", str(resp)))
      return "\n\n".join(all_text_results), None
+
  # ================== UI ==================
  def main():
+     with gr.Blocks(title="📦 Freight JSON Extractor") as demo:
          file = gr.File(label="Upload PDF/Image")
+         question = gr.Textbox(label="Prompt (optional)", lines=2)
          model_choice = gr.Dropdown(choices=[*INTERNAL_MODEL_MAP.keys(), EXTERNAL_MODEL_NAME],
                                     value="Gemini 2.5 Flash", label="Model")
          temperature = gr.Slider(0.0, 2.0, value=0.2, step=0.05)
          top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01)
+         output = gr.Code(label="Gemini Output", language="json")
+         btn = gr.Button("🚀 Run Extraction")
+         btn.click(run_process, [file, question, model_choice, temperature, top_p, gr.State()], outputs=[output, gr.State()])
      return demo

  demo = main()

  if __name__ == "__main__":
      demo.launch()