Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,10 @@ import re
|
|
| 8 |
import requests
|
| 9 |
import trafilatura
|
| 10 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import numpy as np
|
|
@@ -60,26 +64,25 @@ def safe_download_pdf(url: str) -> str:
|
|
| 60 |
return tmp_path
|
| 61 |
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
Extract per-page text from a PDF using PyMuPDF, with light cleaning.
|
| 66 |
-
Returns a list of strings (one per page).
|
| 67 |
-
"""
|
| 68 |
doc = fitz.open(path)
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
|
| 75 |
-
for i in range(start_page - 1, end_page):
|
| 76 |
-
page = doc.load_page(i)
|
| 77 |
-
txt = page.get_text("text")
|
| 78 |
-
txt = preprocess(txt)
|
| 79 |
-
texts.append(txt)
|
| 80 |
|
| 81 |
doc.close()
|
| 82 |
-
return
|
| 83 |
|
| 84 |
|
| 85 |
def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):
|
|
|
|
| 8 |
import requests
|
| 9 |
import trafilatura
|
| 10 |
from bs4 import BeautifulSoup
|
| 11 |
+
import fitz
|
| 12 |
+
import pytesseract
|
| 13 |
+
from PIL import Image
|
| 14 |
+
import io
|
| 15 |
|
| 16 |
import gradio as gr
|
| 17 |
import numpy as np
|
|
|
|
| 64 |
return tmp_path
|
| 65 |
|
| 66 |
|
| 67 |
+
|
| 68 |
+
def pdf_to_text(path):
    """Extract text from every page of a PDF, using OCR when a page has little text.

    Each page is first read through PyMuPDF's text layer. If the stripped
    result is shorter than 50 characters, the page is rendered to a 300 dpi
    PNG and passed to Tesseract via ``pytesseract``.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of strings, one per page, in document order.
    """
    pages_text = []

    # The context manager closes the document even when extraction, rendering
    # or OCR raises part-way through; the previous version leaked the handle
    # in that case because close() was only reached on success.
    with fitz.open(path) as doc:
        for page in doc:
            # 1) Embedded text layer.
            text = page.get_textpage().extractText()

            # 2) Too little text on the page -> OCR fallback on a rendered image.
            if len(text.strip()) < 50:
                pix = page.get_pixmap(dpi=300)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text = pytesseract.image_to_string(img)

            pages_text.append(text)

    return pages_text
|
| 86 |
|
| 87 |
|
| 88 |
def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):
|