Spaces:

roshini7sn
/

CHAT_BOT_PDF_URL

Sleeping

roshini7sn committed on Dec 9, 2025

Commit

04b52b6

verified ·

1 Parent(s): d648b89

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -63,26 +63,32 @@ def safe_download_pdf(url: str) -> str:
     urllib.request.urlretrieve(url, tmp_path)
     return tmp_path
 def pdf_to_text(path):
     # Pre-change implementation: builds a list holding one text string per page
     # of the PDF opened from `path`. Lines marked "-" are removed by this commit.
     doc = fitz.open(path)
-    pages_text = []
     for page in doc:
-        # 1) Best text extractor for academic PDFs
-        text = page.get_textpage().extractText()
-        # 2) If text is empty or garbage → OCR fallback
         # Fewer than 50 characters after stripping whitespace: render the page
         # to a 300-dpi PNG in memory and replace the text with Tesseract's OCR output.
         if len(text.strip()) < 50:
-            pix = page.get_pixmap(dpi=300)
-            img = Image.open(io.BytesIO(pix.tobytes("png")))
-            text = pytesseract.image_to_string(img)
-        pages_text.append(text)
     # NOTE(review): close() is not reached if extraction or OCR raises above.
     doc.close()
-    return pages_text
 def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):

     urllib.request.urlretrieve(url, tmp_path)
     return tmp_path
+import fitz
 def pdf_to_text(path):
     """Extract the text of every page of the PDF at *path*.

     Each page goes through up to three extraction passes, stopping as soon
     as one yields usable text:

     1. ``page.get_text("text")``.
     2. ``page.get_textpage().extractText()`` when the first pass returns
        fewer than 50 stripped characters or contains ``"<<"`` / ``"/Obj"``
        (presumably leaked raw PDF object syntax).
     3. The string payloads of ``page.get_text("blocks")`` joined with
        spaces, when the text is still shorter than 50 stripped characters.

     Args:
         path: Location of the PDF, passed unchanged to ``fitz.open``.

     Returns:
         A list with one text string per page, in document order.
     """
     pages = []
     doc = fitz.open(path)
     try:
         for page in doc:
             # Try the plain-text extractor first.
             text = page.get_text("text")

             # If the text layer is missing, encoded or messy, retry through
             # the textpage extractor.
             if len(text.strip()) < 50 or "<<" in text or "/Obj" in text:
                 try:
                     text = page.get_textpage().extractText()
                 except Exception:
                     # Best effort: keep the first-pass text. Unlike a bare
                     # ``except``, this lets KeyboardInterrupt and SystemExit
                     # propagate.
                     pass

             # If still too short, fall back to raw blocks (original note:
             # "good for ACM"). Index 4 of each block tuple holds its text.
             if len(text.strip()) < 50:
                 blocks = page.get_text("blocks")
                 text = " ".join(b[4] for b in blocks if isinstance(b[4], str))

             pages.append(text)
     finally:
         # Release the document even when an extractor raises.
         doc.close()
     return pages
 def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):