roshini7sn committed on
Commit
d648b89
·
verified ·
1 Parent(s): 20826f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -15
app.py CHANGED
@@ -8,6 +8,10 @@ import re
8
  import requests
9
  import trafilatura
10
  from bs4 import BeautifulSoup
 
 
 
 
11
 
12
  import gradio as gr
13
  import numpy as np
@@ -60,26 +64,25 @@ def safe_download_pdf(url: str) -> str:
60
  return tmp_path
61
 
62
 
63
- def pdf_to_text(path: str, start_page: int = 1, end_page: int | None = None):
64
- """
65
- Extract per-page text from a PDF using PyMuPDF, with light cleaning.
66
- Returns a list of strings (one per page).
67
- """
68
  doc = fitz.open(path)
69
- total_pages = doc.page_count
 
 
 
 
70
 
71
- if end_page is None:
72
- end_page = total_pages
 
 
 
73
 
74
- texts = []
75
- for i in range(start_page - 1, end_page):
76
- page = doc.load_page(i)
77
- txt = page.get_text("text")
78
- txt = preprocess(txt)
79
- texts.append(txt)
80
 
81
  doc.close()
82
- return texts
83
 
84
 
85
  def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):
 
8
  import requests
9
  import trafilatura
10
  from bs4 import BeautifulSoup
11
+ import fitz
12
+ import pytesseract
13
+ from PIL import Image
14
+ import io
15
 
16
  import gradio as gr
17
  import numpy as np
 
64
  return tmp_path
65
 
66
 
67
+
68
+ def pdf_to_text(path):
 
 
 
69
  doc = fitz.open(path)
70
+ pages_text = []
71
+
72
+ for page in doc:
73
+ # 1) Best text extractor for academic PDFs
74
+ text = page.get_textpage().extractText()
75
 
76
+ # 2) If text is empty or garbage → OCR fallback
77
+ if len(text.strip()) < 50:
78
+ pix = page.get_pixmap(dpi=300)
79
+ img = Image.open(io.BytesIO(pix.tobytes("png")))
80
+ text = pytesseract.image_to_string(img)
81
 
82
+ pages_text.append(text)
 
 
 
 
 
83
 
84
  doc.close()
85
+ return pages_text
86
 
87
 
88
  def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):