#!/usr/bin/env python3
import os
import gzip
import json
import re
import subprocess
import xml.etree.ElementTree as ET

from tqdm import tqdm
from urllib.request import urlopen, urlretrieve

PUBMED_DATASET_BASE_URL = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline"
PUBMED_FILE_LIMIT = 10


def get_pubmed_dataset_size():
    """Count the unique .xml.gz files listed on the PubMed baseline index page."""
    try:
        with urlopen(PUBMED_DATASET_BASE_URL) as response:
            html = response.read().decode("utf-8")
        # Match stems like "pubmed25n0001" while skipping .md5 companion files.
        files = re.findall(r"(pubmed\d+n\d+)\.xml\.gz(?!\.)", html)
        return len(set(files))
    except Exception as e:
        print(f"Unable to count PubMed files: {e}")
        return 0


def download_pubmed_xml(output_dir, num_files=1, year='25'):
    """Download the first `num_files` gzipped XML files of the PubMed baseline."""
    os.makedirs(output_dir, exist_ok=True)
    total_dataset_size = get_pubmed_dataset_size()
    files = []
    pbar = tqdm(
        total=num_files,
        desc=f"Downloading {num_files}/{total_dataset_size} files in PubMed dataset",
    )
    for i in range(1, num_files + 1):
        filename = f"pubmed{year}n{i:04d}.xml.gz"
        filepath = os.path.join(output_dir, filename)
        if not os.path.exists(filepath):
            urlretrieve(f"{PUBMED_DATASET_BASE_URL}/{filename}", filepath)
        pbar.update(1)
        files.append(filepath)
    pbar.close()
    return files


def parse_pubmed_to_jsonl(xml_files, output_jsonl):
    """Convert PubMed XML into the JSONL format expected by Pyserini's JsonCollection."""
    with open(output_jsonl, 'w', encoding='utf-8') as out:
        for xml_file in xml_files:
            print(f"Parsing {xml_file}...")
            with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
                tree = ET.parse(f)
            root = tree.getroot()
            for article in tqdm(root.findall('.//PubmedArticle')):
                pmid_elem = article.find('.//PMID')
                title_elem = article.find('.//ArticleTitle')
                # Only the first AbstractText section is kept; structured abstracts
                # with multiple sections are truncated to that first section.
                abstract_elem = article.find('.//Abstract/AbstractText')
                if pmid_elem is None:
                    continue
                # .text can be None when an element only contains markup children.
                title = (title_elem.text or "") if title_elem is not None else ""
                abstract = (abstract_elem.text or "") if abstract_elem is not None else ""
                doc = {
                    'id': pmid_elem.text,
                    'title': title,
                    'contents': f"{title} {abstract}".strip(),
                }
                out.write(json.dumps(doc) + '\n')


def download_pubmed(output_jsonl, num_files=1):
    """Download and parse the PubMed baseline unless the JSONL corpus already exists."""
    if os.path.exists(output_jsonl):
        print(f"Already downloaded PubMed dataset: {output_jsonl}")
        return
    # Ensure the corpus directory exists before parse_pubmed_to_jsonl writes into it.
    os.makedirs(os.path.dirname(output_jsonl), exist_ok=True)
    # Keep the raw XML outside the JsonCollection input directory so it is not indexed.
    xml_dir = os.path.join(os.path.dirname(output_jsonl), '../pubmed-xml')
    xml_files = download_pubmed_xml(xml_dir, num_files=num_files)
    parse_pubmed_to_jsonl(xml_files, output_jsonl)


def build_index_cmd(input_file, index_dir):
    """Assemble the Pyserini command that indexes the directory holding `input_file`."""
    return [
        "python", "-m", "pyserini.index.lucene",
        "--collection", "JsonCollection",
        "--input", os.path.dirname(input_file),
        "--index", index_dir,
        "--generator", "DefaultLuceneDocumentGenerator",
        "--threads", "32",
        "--storePositions", "--storeDocvectors", "--storeRaw",
    ]


def build_index(input_file, index_dir, cmd_generator=build_index_cmd):
    """Run the indexing command unless a non-empty index already exists."""
    if os.path.exists(index_dir) and os.listdir(index_dir):
        print(f"Skipping existing index: {index_dir}")
        return
    os.makedirs(os.path.dirname(index_dir) or '.', exist_ok=True)
    cmd = cmd_generator(input_file, index_dir)
    subprocess.run(cmd, check=True)


def main(base_data_dir="data", base_index_dir="indexes", num_files=1):
    corpus_jsonl = os.path.join(base_data_dir, "pubmed", "corpus.jsonl")
    index_dir = os.path.join(base_index_dir, "pubmed")
    download_pubmed(corpus_jsonl, num_files=num_files)
    build_index(corpus_jsonl, index_dir)


if __name__ == "__main__":
    main(num_files=PUBMED_FILE_LIMIT)
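
# The function below is an optional, hedged sketch that is not wired into main():
# it shows one way to spot-check the finished index through Pyserini's Python
# search API. The index path mirrors main()'s defaults, and the query string is
# an arbitrary example; adjust both for your layout and Pyserini version.
def example_search(index_dir="indexes/pubmed", query="breast cancer immunotherapy", k=5):
    # Imported lazily so the download/indexing pipeline above does not depend on it.
    from pyserini.search.lucene import LuceneSearcher

    searcher = LuceneSearcher(index_dir)
    hits = searcher.search(query, k=k)
    for hit in hits:
        # Each hit exposes the stored document id (the PMID) and its BM25 score.
        print(hit.docid, round(hit.score, 4))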