Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -63,26 +63,32 @@ def safe_download_pdf(url: str) -> str:
|
|
| 63 |
urllib.request.urlretrieve(url, tmp_path)
|
| 64 |
return tmp_path
|
| 65 |
|
| 66 |
-
|
| 67 |
|
| 68 |
def pdf_to_text(path):
|
| 69 |
doc = fitz.open(path)
|
| 70 |
-
|
| 71 |
|
| 72 |
for page in doc:
|
| 73 |
-
#
|
| 74 |
-
text = page.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
-
#
|
| 77 |
if len(text.strip()) < 50:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
text = pytesseract.image_to_string(img)
|
| 81 |
|
| 82 |
-
|
| 83 |
|
| 84 |
doc.close()
|
| 85 |
-
return
|
| 86 |
|
| 87 |
|
| 88 |
def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):
|
|
|
|
| 63 |
urllib.request.urlretrieve(url, tmp_path)
|
| 64 |
return tmp_path
|
| 65 |
|
| 66 |
+
import fitz
|
| 67 |
|
| 68 |
def _extract_page_text(page, min_chars=50):
    """Return the best available text for a single PDF page.

    Extraction is attempted in up to three stages, each used only when the
    previous result looks unusable:

    1. ``page.get_text("text")`` -- the plain text layer.
    2. ``page.get_textpage().extractText()`` -- tried when the text is
       shorter than ``min_chars`` or contains what looks like raw PDF
       syntax (``<<`` or ``/Obj``).
    3. ``page.get_text("blocks")`` -- the string at index 4 of every block,
       joined with spaces, when the text is still shorter than
       ``min_chars`` (the original author notes this helps with ACM
       papers).

    Args:
        page: A page object yielded by iterating an opened ``fitz`` document.
        min_chars: Minimum number of non-whitespace-trimmed characters for
            an extraction result to be considered usable.

    Returns:
        The extracted text of the page as a string (possibly empty).
    """
    text = page.get_text("text")

    # A very short result, or one containing PDF object markers, suggests the
    # text layer is missing, encoded, or messy -> try the textpage extractor.
    if len(text.strip()) < min_chars or "<<" in text or "/Obj" in text:
        try:
            text = page.get_textpage().extractText()
        except Exception:
            # Best-effort fallback: keep the first result, but do not
            # swallow KeyboardInterrupt/SystemExit and leave a trace of why
            # the alternative extractor failed.
            import logging

            logging.getLogger(__name__).debug(
                "textpage extraction failed; keeping plain text result",
                exc_info=True,
            )

    # If still bad, fall back to the raw blocks.
    if len(text.strip()) < min_chars:
        blocks = page.get_text("blocks")
        text = " ".join(b[4] for b in blocks if isinstance(b[4], str))

    return text


def pdf_to_text(path):
    """Extract the text of every page of a PDF file.

    Args:
        path: Filesystem path of the PDF, passed to ``fitz.open``.

    Returns:
        A list with one string per page, in document order.
    """
    doc = fitz.open(path)
    try:
        return [_extract_page_text(page) for page in doc]
    finally:
        # Always release the document, even if extraction fails mid-way.
        doc.close()
|
| 92 |
|
| 93 |
|
| 94 |
def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):
|