Spaces:

Dpshkh
/

pdf

Running

App Files Files Community

pdf / parser.py

Dpshkh's picture

Upload 8 files

beba6d9 verified 5 months ago

history blame contribute delete

441 Bytes

	from pypdf import PdfReader
	import io

	def extract_text_from_pdf(file_bytes: bytes, max_pages: int = 20) -> str:
		"""Return the text of the first pages of an in-memory PDF.

		Args:
			file_bytes: Raw contents of the PDF file.
			max_pages: Maximum number of leading pages to read. A value of
				zero or less yields an empty string.

		Returns:
			The extracted text of each processed page, joined with newlines.
			Pages for which no text is extracted are skipped, so the result
			is an empty string when nothing could be extracted.

		Note:
			Errors raised by ``PdfReader`` or ``extract_text`` are not caught
			here and propagate to the caller.
		"""
		# Wrap the bytes in a file-like object so the reader can work on them
		# without a temporary file on disk.
		reader = PdfReader(io.BytesIO(file_bytes))
		text_chunks = []

		for i, page in enumerate(reader.pages):
			if i >= max_pages:
				break  # Stop early to limit memory use
			text = page.extract_text()
			# Skip pages that produced no text (empty or falsy result).
			if text:
				text_chunks.append(text)

		return "\n".join(text_chunks)