Spaces:

roshini7sn
/

CHAT_BOT_PDF_URL

Sleeping

roshini7sn committed on Dec 9, 2025

Commit

d648b89

verified ·

1 Parent(s): 20826f5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -8,6 +8,10 @@ import re
 import requests
 import trafilatura
 from bs4 import BeautifulSoup
 import gradio as gr
 import numpy as np
@@ -60,26 +64,25 @@ def safe_download_pdf(url: str) -> str:
     return tmp_path
-def pdf_to_text(path: str, start_page: int = 1, end_page: int | None = None):
-    """
-    Extract per-page text from a PDF using PyMuPDF, with light cleaning.
-    Returns a list of strings (one per page).
-    """
     doc = fitz.open(path)
-    total_pages = doc.page_count
-    if end_page is None:
-        end_page = total_pages
-    texts = []
-    for i in range(start_page - 1, end_page):
-        page = doc.load_page(i)
-        txt = page.get_text("text")
-        txt = preprocess(txt)
-        texts.append(txt)
     doc.close()
-    return texts
 def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):

 import requests
 import trafilatura
 from bs4 import BeautifulSoup
+import fitz
+import pytesseract
+from PIL import Image
+import io
 import gradio as gr
 import numpy as np
     return tmp_path
def pdf_to_text(path, *, min_chars=50, ocr_dpi=300):
    """Extract the text of every page of a PDF, with an OCR fallback.

    Each page is first read through PyMuPDF's text extraction. If the
    result, stripped of surrounding whitespace, is shorter than
    ``min_chars`` characters, the page is rendered to a PNG image at
    ``ocr_dpi`` and passed to Tesseract instead, and the OCR output
    replaces the extracted text for that page.

    Args:
        path: Location of the PDF file, as accepted by ``fitz.open``.
        min_chars: Minimum stripped length of the extracted text below
            which the OCR fallback is used.
        ocr_dpi: Resolution used when rendering a page for OCR.

    Returns:
        A list of strings, one per page, in document order.
    """
    pages_text = []
    doc = fitz.open(path)
    try:
        for page in doc:
            # 1) Try the text PyMuPDF can extract directly from the page.
            text = page.get_textpage().extractText()

            # 2) Too little text came back -> render the page and OCR it.
            if len(text.strip()) < min_chars:
                pix = page.get_pixmap(dpi=ocr_dpi)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text = pytesseract.image_to_string(img)

            pages_text.append(text)
    finally:
        # Close the document even when extraction or OCR raises, so the
        # underlying file handle is not leaked.
        doc.close()
    return pages_text
 def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):