roshini7sn committed on
Commit
04b52b6
·
verified ·
1 Parent(s): d648b89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -10
app.py CHANGED
@@ -63,26 +63,32 @@ def safe_download_pdf(url: str) -> str:
63
  urllib.request.urlretrieve(url, tmp_path)
64
  return tmp_path
65
 
66
-
67
 
68
  def pdf_to_text(path):
69
  doc = fitz.open(path)
70
- pages_text = []
71
 
72
  for page in doc:
73
- # 1) Best text extractor for academic PDFs
74
- text = page.get_textpage().extractText()
 
 
 
 
 
 
 
75
 
76
- # 2) If text is empty or garbage OCR fallback
77
  if len(text.strip()) < 50:
78
- pix = page.get_pixmap(dpi=300)
79
- img = Image.open(io.BytesIO(pix.tobytes("png")))
80
- text = pytesseract.image_to_string(img)
81
 
82
- pages_text.append(text)
83
 
84
  doc.close()
85
- return pages_text
86
 
87
 
88
  def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):
 
63
  urllib.request.urlretrieve(url, tmp_path)
64
  return tmp_path
65
 
66
+ import fitz
67
 
68
  def pdf_to_text(path):
69
  doc = fitz.open(path)
70
+ pages = []
71
 
72
  for page in doc:
73
+ # Try the best structured extractor
74
+ text = page.get_text("text")
75
+
76
+ # If the text layer is encoded or messy → use textpage extractor
77
+ if len(text.strip()) < 50 or "<<" in text or "/Obj" in text:
78
+ try:
79
+ text = page.get_textpage().extractText()
80
+ except:
81
+ pass
82
 
83
+ # If still bad, use raw blocks (good for ACM)
84
  if len(text.strip()) < 50:
85
+ blocks = page.get_text("blocks")
86
+ text = " ".join(b[4] for b in blocks if isinstance(b[4], str))
 
87
 
88
+ pages.append(text)
89
 
90
  doc.close()
91
+ return pages
92
 
93
 
94
  def text_to_chunks(texts, word_length=WORD_CHUNK_SIZE, start_page=1):