vithacocf committed on
Commit
5d887ea
·
verified ·
1 Parent(s): 2afa416

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -147
app.py CHANGED
@@ -1,227 +1,180 @@
1
- import os, io, tempfile, mimetypes, camelot, pdfplumber, pandas as pd, google.generativeai as genai
2
- import re
 
 
 
3
  import gradio as gr
 
 
 
 
 
 
 
4
  DEFAULT_API_KEY = "AIzaSyBbK-1P3JD6HPyE3QLhkOps6_-Xo3wUFbs"
 
5
  INTERNAL_MODEL_MAP = {
6
  "Gemini 2.5 Flash": "gemini-2.5-flash",
7
  "Gemini 2.5 Pro": "gemini-2.5-pro",
8
  }
 
 
9
  PROMPT_FREIGHT_JSON = """
10
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
11
  {
12
  "shipping_line": "...",
13
  "shipping_line_code": "...",
14
- "shipping_line_reason": "Why this carrier is chosen?",
15
  "fee_type": "Air Freight",
16
  "valid_from": ...,
17
  "valid_to": ...,
18
- "charges": [
19
- {
20
- "frequency": "...",
21
- "package_type": "...",
22
- "aircraft_type": "...",
23
- "direction": "Export or Import or null",
24
- "origin": "...",
25
- "destination": "...",
26
- "charge_name": "...",
27
- "charge_code": "...",
28
- "charge_code_reason": "...",
29
- "cargo_type": "...",
30
- "currency": "...",
31
- "transit": "...",
32
- "transit_time": "...",
33
- "weight_breaks": {
34
- "M": ...,
35
- "N": ...,
36
- "+45kg": ...,
37
- "+100kg": ...,
38
- "+300kg": ...,
39
- "+500kg": ...,
40
- "+1000kg": ...,
41
- "other": {
42
- key: value
43
- },
44
- "weight_breaks_reason":"Why chosen weight_breaks?"
45
- },
46
- "remark": "..."
47
- }
48
- ],
49
- "local_charges": [
50
- {
51
- "charge_name": "...",
52
- "charge_code": "...",
53
- "unit": "...",
54
- "amount": ...,
55
- "remark": "..."
56
- }
57
- ]
58
  }
59
  ### Date rules
60
- - valid_from format:
61
- - `DD/MM/YYYY` (if full date)
62
- - `01/MM/YYYY` (if month+year only)
63
- - `01/01/YYYY` (if year only)
64
- - `UFN` if missing
65
- - valid_to:
66
- - exact `DD/MM/YYYY` if present
67
- - else `UFN`
68
  STRICT RULES:
69
- - ONLY return a single JSON object as specified above.
70
- - All rates must exactly match the corresponding weight break columns (M,N,45kg, 100kg, 300kg, 500kg, 1000kg, etc.). set null if N/A. No assumptions or interpolations.
71
- - If the table shows "RQ" or similar, set value as "RQST".
72
- - Group same-price destinations into one record separated by "/".
73
- - Always use IATA code for origin and destination.
74
- - Flight number (e.g. ZH118) is not charge code.
75
- - Frequency: D[1-7]; 'Daily' = D1234567. Join multiple (e.g. D3,D4→D34).
76
- - If local charges exist, list them.
77
- - If validity missing, set null.
78
- - Direction: Export if origin is Vietnam (SGN, HAN, DAD...), else Import.
79
- - Provide short plain English reasons for "shipping_line_reason" & "charge_code_reason".
80
- - Replace commas in remarks with semicolons.
81
- - Only return JSON.
82
  """
83
 
84
- # ========== Helpers ==========
85
- def _read_file_bytes(upload):
86
- if isinstance(upload, str):
87
- with open(upload, "rb") as f: return f.read()
88
- elif hasattr(upload, "read"):
 
 
 
 
 
 
89
  return upload.read()
90
- raise TypeError("Unsupported file input")
91
 
92
- def _guess_name_and_mime(file, file_bytes):
93
  filename = os.path.basename(file.name if hasattr(file, "name") else str(file))
94
  mime, _ = mimetypes.guess_type(filename)
95
- if not mime and file_bytes[:4] == b"%PDF": mime = "application/pdf"
 
96
  return filename, mime or "application/octet-stream"
97
 
98
- def check_pdf_structure(file_bytes: bytes) -> bool:
99
- try:
100
- with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
101
- if len(pdf.pages) <= 2: return False
102
- for page in pdf.pages[:3]:
103
- if page.find_tables(): return True
104
- return False
105
- except Exception as e:
106
- print("PDF check error:", e); return False
107
-
108
- # ========== 1️⃣ Extract bảng bằng Camelot ==========
109
  def extract_pdf_tables(file_path: str) -> pd.DataFrame:
 
 
 
 
 
 
110
  all_dfs = []
 
 
111
  try:
112
- print("🔍 Try lattice mode...")
113
  tables = camelot.read_pdf(file_path, flavor="lattice", pages="all")
114
- if tables.n > 0:
115
- for t in tables: all_dfs.append(t.df)
116
- print(f"✅ Lattice: {tables.n} tables.")
 
117
  except Exception as e:
118
- print(f"⚠️ Lattice failed: {e}")
119
 
 
120
  if not all_dfs:
121
  try:
122
- print("🔁 Try stream mode...")
123
  tables = camelot.read_pdf(file_path, flavor="stream", pages="all")
124
- if tables.n > 0:
125
- for t in tables: all_dfs.append(t.df)
126
- print(f"✅ Stream: {tables.n} tables.")
 
127
  except Exception as e:
128
- print(f"❌ Stream failed: {e}")
129
 
130
  if not all_dfs:
131
- print("🚫 No table detected.")
132
  return pd.DataFrame()
133
 
 
134
  df_final = pd.concat(all_dfs, ignore_index=True)
 
 
135
  if all(str(c).isdigit() for c in df_final.columns):
136
- print("🧠 Detected numeric headers (0,1,2..), using first row as real header.")
137
  df_final.columns = df_final.iloc[0]
138
  df_final = df_final[1:]
 
139
  df_final = df_final.dropna(how="all").reset_index(drop=True)
140
- print(f"✅ Total: {len(df_final)} rows × {len(df_final.columns)} columns.")
141
  return df_final
142
 
143
- # ========== 2️⃣ Extract phần Note / Header ==========
144
- def extract_pdf_note(file_bytes: bytes) -> str:
145
  """
146
- Lấy phần text đầu PDF (ví dụ: Start Date, Expiry Date, Origin, các note nhỏ)
147
- Bỏ qua vùng bảng phía dưới.
148
  """
149
  try:
150
- with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
151
- first_page = pdf.pages[0]
152
- text = first_page.extract_text() or ""
153
- # cắt phần note: chỉ lấy 15 dòng đầu để tránh trích luôn bảng
154
- lines = text.splitlines()[:15]
155
- note_lines = []
156
- for line in lines:
157
- if re.search(r"(Start Date|Origin|Expiry|Product|MY|SC|All rates|Currency)", line, re.I):
158
- note_lines.append(line.strip())
159
- note_text = " ".join(note_lines)
160
- return note_text.strip()
161
  except Exception as e:
162
- print(f"⚠️ Note extraction failed: {e}")
163
  return ""
164
 
165
- # ========== 3️⃣ Gọi Gemini ==========
166
- def call_gemini_with_prompt(csv_text: str, note_text: str, model_choice: str, temperature: float, top_p: float):
167
  api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
168
  genai.configure(api_key=api_key)
169
  model = genai.GenerativeModel(
170
  model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
171
  generation_config={"temperature": temperature, "top_p": top_p}
172
  )
173
- prompt = f"""{PROMPT_FREIGHT_JSON}
174
- Below is the extracted freight rate table (CSV) and additional notes:
175
- Notes:
176
- {note_text or '[No notes detected]'}
177
- CSV:
178
- {csv_text}
179
- → Convert to valid JSON as per schema above.
180
- """
181
- resp = model.generate_content(prompt)
182
- return getattr(resp, "text", str(resp))
183
 
184
- # ========== 4️⃣ Main process ==========
185
  def run_process(file, question, model_choice, temperature, top_p, external_api_url):
186
  try:
187
  if file is None:
188
  return "❌ No file uploaded.", None
 
189
  file_bytes = _read_file_bytes(file)
190
  filename, mime = _guess_name_and_mime(file, file_bytes)
191
  print(f"[UPLOAD] {filename} ({mime})")
192
 
193
- if mime == "application/pdf" and check_pdf_structure(file_bytes):
194
- print("➡️ PDF has multi-page table extract before Gemini.")
195
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
196
  tmp.write(file_bytes)
197
  tmp_path = tmp.name
198
 
 
199
  df = extract_pdf_tables(tmp_path)
 
 
200
  if not df.empty:
201
- note_text = extract_pdf_note(file_bytes)
202
  csv_text = df.to_csv(index=False)
203
- print("✅ Send table + note to Gemini...")
204
- message = call_gemini_with_prompt(csv_text, note_text, model_choice, temperature, top_p)
205
  return message, None
206
  else:
207
- print("⚠️ No valid table found fallback to OCR Gemini.")
208
-
209
- # fallback OCR
210
- api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
211
- genai.configure(api_key=api_key)
212
- model = genai.GenerativeModel(
213
- model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
214
- generation_config={"temperature": temperature, "top_p": top_p}
215
- )
216
- uploaded = genai.upload_file(path=file.name)
217
- resp = model.generate_content([PROMPT_FREIGHT_JSON, uploaded])
218
- genai.delete_file(uploaded.name)
219
- return getattr(resp, "text", str(resp)), None
220
 
221
  except Exception as e:
222
  return f"ERROR: {type(e).__name__}: {e}", None
223
 
224
- # ================== UI ==================
225
  def main():
226
  with gr.Blocks(title="OCR Multi-Agent System") as demo:
227
  file = gr.File(label="Upload PDF/Image")
@@ -242,9 +195,8 @@ def main():
242
 
243
  return demo
244
 
 
245
  demo = main()
246
 
247
  if __name__ == "__main__":
248
- import os
249
- os.system("which gs || echo '⚠️ ghostscript (gs) not found in PATH'")
250
- demo.launch()
 
1
+ from __future__ import annotations
2
+ import os, io, re, json, time, mimetypes, tempfile
3
+ from typing import List, Union, Tuple, Any
4
+ from PIL import Image
5
+ import pandas as pd
6
  import gradio as gr
7
+ import google.generativeai as genai
8
+ import requests
9
+ import fitz # PyMuPDF
10
+ import camelot
11
+ import pdfplumber
12
+
13
# ================== CONFIG ==================
# SECURITY: a Google API key used to be hard-coded on this line and was
# therefore published together with the source. Treat that key as compromised
# and revoke it. The key must now come from the GOOGLE_API_KEY environment
# variable; DEFAULT_API_KEY stays defined as a module-level name because the
# Gemini helpers in this file use it as the fallback of
# os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY).
DEFAULT_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

# Maps a `model_choice` label to the Gemini model identifier sent to the API.
INTERNAL_MODEL_MAP = {
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "Gemini 2.5 Pro": "gemini-2.5-pro",
}
# Label for the external OCR model option (not referenced by the code visible
# in this part of the file).
EXTERNAL_MODEL_NAME = "prithivMLmods/Camel-Doc-OCR-062825 (External)"
21
+
22
  PROMPT_FREIGHT_JSON = """
23
  Please analyze the freight rate table in the file I provide and convert it into JSON in the following structure:
24
  {
25
  "shipping_line": "...",
26
  "shipping_line_code": "...",
 
27
  "fee_type": "Air Freight",
28
  "valid_from": ...,
29
  "valid_to": ...,
30
+ "charges": [...],
31
+ "local_charges": [...]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
  ### Date rules
34
+ - valid_from: DD/MM/YYYY, 01/MM/YYYY, 01/01/YYYY or 'UFN'
35
+ - valid_to: DD/MM/YYYY or 'UFN'
 
 
 
 
 
 
36
  STRICT RULES:
37
+ - Only return JSON, no explanation.
38
+ - All rates must match the weight break columns (M,N,45kg,100kg,...).
39
+ - Use IATA code for origin/destination.
40
+ - Direction: Export if origin in Vietnam, else Import.
41
+ - Combine with “notes” at bottom of PDF if relevant.
 
 
 
 
 
 
 
 
42
  """
43
 
44
+ # ================== HELPERS ==================
45
def _read_file_bytes(upload: Union[str, os.PathLike, dict, object, None]) -> bytes:
    """Return the raw bytes of an uploaded file.

    Supported inputs, checked in this order:

    * a filesystem path (``str`` or ``os.PathLike``);
    * a ``dict`` that carries a ``"path"`` entry;
    * any object exposing ``read()``.

    Raises:
        ValueError: if ``upload`` is ``None``.
        TypeError: if ``upload`` is none of the supported kinds.
    """
    if upload is None:
        raise ValueError("No file uploaded.")

    path = None
    if isinstance(upload, (str, os.PathLike)):
        path = upload
    elif isinstance(upload, dict) and "path" in upload:
        path = upload["path"]
    if path is not None:
        with open(path, "rb") as handle:
            return handle.read()

    if hasattr(upload, "read"):
        # The handle may already have been consumed by whoever produced it;
        # without rewinding, read() would silently return an empty payload.
        seek = getattr(upload, "seek", None)
        if callable(seek):
            try:
                seek(0)
            except (OSError, ValueError):
                # Non-seekable stream: fall back to reading from the current
                # position, which is what the code did before.
                pass
        data = upload.read()
        # A text-mode handle yields str; callers compare against byte magic
        # numbers and write to binary files, so honour the bytes contract.
        if isinstance(data, str):
            data = data.encode("utf-8")
        return data

    raise TypeError(f"Unsupported file object: {type(upload)}")
57
 
58
def _guess_name_and_mime(file, file_bytes: bytes) -> Tuple[str, str]:
    """Derive a display filename and a MIME type for an upload.

    Args:
        file: the upload as received: a path, an object with a ``name``
            attribute, or a ``dict`` describing the file.
        file_bytes: the file content, used to sniff the type when the
            filename extension is missing or unknown.

    Returns:
        ``(filename, mime)``; ``mime`` falls back to
        ``"application/octet-stream"`` when nothing can be determined.
    """
    if isinstance(file, dict):
        # _read_file_bytes accepts dict uploads keyed by "path"; stringifying
        # the whole dict would produce a meaningless filename.
        # "orig_name" is presumably the Gradio-style original filename;
        # verify against the actual caller.
        raw_name = file.get("orig_name") or file.get("path") or ""
    elif hasattr(file, "name"):
        raw_name = file.name
    else:
        raw_name = file
    filename = os.path.basename(str(raw_name))

    mime, _ = mimetypes.guess_type(filename)
    if not mime:
        # No usable extension: fall back to well-known file signatures.
        head = file_bytes[:8]
        if head[:4] == b"%PDF":
            mime = "application/pdf"
        elif head == b"\x89PNG\r\n\x1a\n":
            mime = "image/png"
        elif head[:3] == b"\xff\xd8\xff":
            mime = "image/jpeg"
    return filename, mime or "application/octet-stream"
64
 
 
 
 
 
 
 
 
 
 
 
 
65
def extract_pdf_tables(file_path: str) -> pd.DataFrame:
    """Extract every table Camelot finds in a PDF and merge them into one frame.

    The ``lattice`` flavor (tables drawn with ruling lines) is tried first;
    ``stream`` (tables without clear lines) is only tried when lattice yields
    nothing. Extraction is best-effort: a failure in either flavor is logged
    and treated as "no tables".

    Args:
        file_path: path of the PDF on disk.

    Returns:
        One DataFrame with all detected tables stacked vertically, fully
        blank rows removed and, when the columns are still positional numbers,
        the first row promoted to header. An empty DataFrame when no usable
        table is found.
    """
    # (flavor, start message, success label, failure label); the log text is
    # kept exactly as before, only the duplicated try/except is folded.
    attempts = (
        ("lattice", "🔍 Thử extract bằng lattice...", "✅ Lattice", "⚠️ Lattice lỗi"),
        ("stream", "🔁 Thử extract bằng stream...", "✅ Stream", "❌ Stream lỗi"),
    )

    frames = []
    for flavor, start_msg, ok_label, fail_label in attempts:
        try:
            print(start_msg)
            tables = camelot.read_pdf(file_path, flavor=flavor, pages="all")
            if tables and tables.n > 0:
                # Skip zero-row tables so the header promotion below always
                # has a first row to work with.
                frames.extend(t.df for t in tables if not t.df.empty)
                print(f"{ok_label}: {tables.n} bảng phát hiện.")
        except Exception as e:  # best-effort: any failure means "try the next flavor"
            print(f"{fail_label}: {e}")
        if frames:
            break

    if not frames:
        print("🚫 Không phát hiện bảng nào.")
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)

    # Camelot reports empty cells as "" rather than NaN, so dropna(how="all")
    # alone never removed blank rows. Treat NaN and whitespace-only cells alike.
    is_blank = merged.isna() | merged.apply(
        lambda col: col.astype(str).str.strip().eq("")
    )
    merged = merged.loc[~is_blank.all(axis=1)].copy()

    # Positional column labels (0, 1, 2, ...) mean the real header is the
    # first data row. Guard against an empty frame to avoid IndexError.
    if not merged.empty and all(str(c).isdigit() for c in merged.columns):
        merged.columns = merged.iloc[0]
        merged = merged.iloc[1:]

    merged = merged.reset_index(drop=True)
    print(f"✅ Tổng hợp {len(merged)} dòng, {len(merged.columns)} cột.")
    return merged
112
 
113
def extract_pdf_note(file_path: str, max_lines: int = 12) -> str:
    """Return the trailing text lines of the last PDF page (notes, remarks).

    Args:
        file_path: path of the PDF on disk.
        max_lines: how many lines to keep from the end of the last page's
            text. Defaults to 12, the value that was previously hard-coded.

    Returns:
        The selected lines joined with newlines, or ``""`` when the PDF has
        no pages, the page has no extractable text, ``max_lines`` is not
        positive, or extraction fails (the failure is logged, not raised).
    """
    if max_lines <= 0:
        # lines[-0:] would return *all* lines, so handle this explicitly.
        return ""

    try:
        with pdfplumber.open(file_path) as pdf:
            if not pdf.pages:
                return ""
            text = (pdf.pages[-1].extract_text() or "").strip()
    except Exception as e:  # best-effort: notes are optional context
        print(f"⚠️ extract_pdf_note lỗi: {e}")
        return ""

    note_text = "\n".join(text.splitlines()[-max_lines:])
    if note_text:
        # Only report success when something was actually extracted.
        print("📝 Extracted note text thành công.")
    return note_text
129
 
130
def call_gemini_with_prompt(content_text: str, note_text: str, question: str, model_choice: str, temperature: float, top_p: float):
    """Send the extracted table and notes to Gemini and return its reply text.

    Args:
        content_text: the table data as CSV text.
        note_text: free-text notes extracted from the PDF (may be empty).
        question: optional extra instruction appended to the prompt.
        model_choice: key into ``INTERNAL_MODEL_MAP``; unknown keys fall back
            to ``"gemini-2.5-flash"``.
        temperature: sampling temperature passed in ``generation_config``.
        top_p: nucleus-sampling value passed in ``generation_config``.

    Returns:
        The response text, or ``str(response)`` when the response carries no
        readable text.
    """
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(
        model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
        generation_config={"temperature": temperature, "top_p": top_p},
    )
    prompt = f"{PROMPT_FREIGHT_JSON}\n\nBelow is the extracted CSV data:\n{content_text}\n\nBelow are the notes:\n{note_text}\n\n{question or ''}"
    response = model.generate_content(prompt)
    try:
        # `.text` is a property that raises ValueError when the response has
        # no valid text part (e.g. blocked by safety filters); getattr() with
        # a default only covers AttributeError, so handle both explicitly.
        return response.text
    except (AttributeError, ValueError):
        return str(response)
 
 
 
 
 
 
 
141
 
142
+ # ================== MAIN ROUTER ==================
143
def _write_temp_file(file_bytes, suffix):
    """Persist bytes to a named temporary file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file_bytes)
        return tmp.name


def _remove_quietly(path):
    """Delete a temporary file, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except OSError:
        pass


def _gemini_ocr_fallback(file_bytes, filename, mime, question, model_choice, temperature, top_p):
    """Upload the raw file to Gemini and return the model's reply text."""
    api_key = os.environ.get("GOOGLE_API_KEY", DEFAULT_API_KEY)
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(
        model_name=INTERNAL_MODEL_MAP.get(model_choice, "gemini-2.5-flash"),
        generation_config={"temperature": temperature, "top_p": top_p},
    )

    # Upload from our own copy of the bytes: the incoming upload is not
    # guaranteed to be an object with a usable `.name` path.
    tmp_path = _write_temp_file(file_bytes, os.path.splitext(filename)[1])
    uploaded = None
    try:
        uploaded = genai.upload_file(
            path=tmp_path,
            # Let the service detect the type when we could not.
            mime_type=None if mime == "application/octet-stream" else mime,
        )
        parts = [PROMPT_FREIGHT_JSON, uploaded]
        if question:
            parts.append(question)
        response = model.generate_content(parts)
        try:
            # `.text` raises ValueError when the response has no text part.
            return response.text
        except (AttributeError, ValueError):
            return str(response)
    finally:
        if uploaded is not None:
            try:
                genai.delete_file(uploaded.name)
            except Exception as e:  # cleanup must not mask the real result
                print(f"[CLEANUP] could not delete uploaded file: {e}")
        _remove_quietly(tmp_path)


def run_process(file, question, model_choice, temperature, top_p, external_api_url):
    """Convert an uploaded rate sheet into the JSON text produced by Gemini.

    For PDFs, tables are extracted locally first and sent to Gemini as CSV
    together with the trailing notes. When no table is found, or for any other
    file type, the raw file is uploaded to Gemini instead.

    Args:
        file: the uploaded file as delivered by the UI, or ``None``.
        question: optional extra instruction for the model.
        model_choice: key into ``INTERNAL_MODEL_MAP``.
        temperature: sampling temperature for generation.
        top_p: nucleus-sampling value for generation.
        external_api_url: accepted for interface compatibility; not used here.

    Returns:
        ``(message, None)``. ``message`` is the model output, or a
        human-readable ``"ERROR: ..."`` string when anything fails; this
        function does not raise.
    """
    if file is None:
        return "❌ No file uploaded.", None

    try:
        file_bytes = _read_file_bytes(file)
        filename, mime = _guess_name_and_mime(file, file_bytes)
        print(f"[UPLOAD] {filename} ({mime})")

        if mime == "application/pdf":
            # Camelot and pdfplumber need a real file on disk.
            tmp_path = _write_temp_file(file_bytes, ".pdf")
            try:
                df = extract_pdf_tables(tmp_path)
                note_text = extract_pdf_note(tmp_path)
            finally:
                # Previously this temp file was created with delete=False and
                # never removed, leaking one file per request.
                _remove_quietly(tmp_path)

            if not df.empty:
                csv_text = df.to_csv(index=False)
                print("✅ Gửi Gemini để sinh JSON...")
                message = call_gemini_with_prompt(csv_text, note_text, question, model_choice, temperature, top_p)
                return message, None
            print("⚠️ Không bảng hợp lệ, fallback OCR Gemini.")

        # Non-PDF input, or a PDF without detectable tables: let Gemini read
        # the file directly. This replaces a call to a function that is not
        # defined in this file and therefore always failed with NameError.
        message = _gemini_ocr_fallback(file_bytes, filename, mime, question, model_choice, temperature, top_p)
        return message, None

    except Exception as e:  # UI boundary: report the failure instead of crashing
        return f"ERROR: {type(e).__name__}: {e}", None
176
 
177
+ # ================== UI ==================
178
  def main():
179
  with gr.Blocks(title="OCR Multi-Agent System") as demo:
180
  file = gr.File(label="Upload PDF/Image")
 
195
 
196
  return demo
197
 
198
+
199
  demo = main()
200
 
201
  if __name__ == "__main__":
202
+ demo.launch()