from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
import requests
from time import sleep
import spacy

# "en_core_sci_sm" is the small scispaCy model for scientific text; it must be
# installed separately from spaCy itself.
nlp = spacy.load("en_core_sci_sm")

router = APIRouter()
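
# This router expects the application to attach an async MongoDB collection to
# app.state.collection2 at startup. A minimal sketch, assuming Motor is the
# driver (the connection URL and the "papers_db" / "venue_metadata" names are
# illustrative, not taken from this module):
#
#     from motor.motor_asyncio import AsyncIOMotorClient
#
#     @app.on_event("startup")
#     async def init_db():
#         client = AsyncIOMotorClient("mongodb://localhost:27017")
#         app.state.collection2 = client["papers_db"]["venue_metadata"]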


class SearchRequest(BaseModel):
    topic: str
    year: int
    userId: str


def extract_keywords(text):
    doc = nlp(text.lower())
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    individual_tokens = [
        token.text.strip() for token in doc
        if token.pos_ in ["NOUN", "VERB"] and not token.is_stop
    ]
    keywords = set(noun_chunks + individual_tokens)

    # Drop any keyword that is a strict substring of a longer noun chunk, so
    # "learning" is removed when "machine learning" is already present.
    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk and keyword != chunk for chunk in noun_chunks):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)
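
# Illustrative only -- the exact output depends on the loaded model:
#     extract_keywords("Machine learning for drug discovery")
#     -> ["drug discovery", "machine learning"]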


def fetch_works(refined_query, year, per_page=200):
    OPENALEX_API_URL = "https://api.openalex.org/works"
    params = {
        "filter": f"title_and_abstract.search:{refined_query},publication_year:{year}",
        "per_page": per_page,
        "cursor": "*"
    }
    all_results = []

    # Follow OpenAlex cursor pagination until no next_cursor is returned.
    while True:
        response = requests.get(OPENALEX_API_URL, params=params, timeout=30)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Error fetching data from OpenAlex")

        data = response.json()
        all_results.extend(data.get("results", []))

        next_cursor = data.get("meta", {}).get("next_cursor")
        if next_cursor:
            params["cursor"] = next_cursor
        else:
            break

        sleep(0.2)  # small delay between pages to stay polite to the API

    return all_results
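
# Conceptually, the first page request looks like this (query and year are
# illustrative; requests will percent-encode the filter value):
#     https://api.openalex.org/works?filter=title_and_abstract.search:machine+learning,publication_year:2023&per_page=200&cursor=*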


def batch_fetch_host_org_details(source_ids):
    host_org_map = {}
    # OpenAlex caps OR-filters at 50 values, so batch in groups of 50.
    batch_size = 50
    for i in range(0, len(source_ids), batch_size):
        batch = source_ids[i:i + batch_size]
        openalex_ids = [src_id.split("/")[-1] for src_id in batch]
        filter_query = "|".join(openalex_ids)
        url = f"https://api.openalex.org/sources?filter=openalex_id:{filter_query}"
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            sources = response.json().get("results", [])
            for source in sources:
                source_id = source.get("id")
                host_org_id = source.get("host_organization", "unknown")
                host_org_name = source.get("host_organization_name", "unknown")
                host_org_map[source_id] = {
                    "host_organization_name": host_org_name,
                    "host_organization_id": host_org_id
                }
            sleep(0.1)
        else:
            raise HTTPException(status_code=response.status_code, detail="Error fetching host organization details")
    return host_org_map


def extract_metadata(works):
    # Unique source IDs across all works that actually have a primary source.
    source_ids = list({
        work["primary_location"]["source"]["id"]
        for work in works
        if work.get("primary_location") and work["primary_location"].get("source")
    })

    host_org_map = batch_fetch_host_org_details(source_ids)

    metadata = []
    for work in works:
        primary_location = work.get("primary_location") or {}
        source = primary_location.get("source") or {}

        source_id = source.get("id")
        # Lowercase "unknown" keeps the sentinel consistent with clean_metadata().
        host_org_details = host_org_map.get(
            source_id,
            {"host_organization_name": "unknown", "host_organization_id": "unknown"}
        )

        # Prefer the work-level type, then fall back to location- and source-level.
        work_type = (
            work.get("type") or
            primary_location.get("type") or
            source.get("type") or
            "unknown"
        )

        metadata.append({
            "id": work.get("id"),
            "is_oa": work.get("open_access", {}).get("is_oa", False),
            "oa_status": work.get("open_access", {}).get("oa_status", "unknown"),
            "venue": source.get("display_name", "unknown"),
            "venue_id": source_id or "unknown",
            "host_organization_name": host_org_details["host_organization_name"],
            "host_organization_id": host_org_details["host_organization_id"],
            "type": work_type,
            "publication_date": work.get("publication_date", "unknown"),
        })
    return metadata


def clean_metadata(metadata):
    initial_count = len(metadata)
    # Drop entries where neither the venue nor the host organization is known.
    cleaned_metadata = [
        entry for entry in metadata
        if not (entry["venue"] == "unknown" and entry["host_organization_name"] == "unknown")
    ]
    final_count = len(cleaned_metadata)
    print(f"Total papers before cleaning: {initial_count}")
    print(f"Total papers after cleaning: {final_count}")
    return cleaned_metadata


async def save_to_mongodb(userId, topic, year, metadata, request: Request):
    document = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "metadata": metadata
    }
    collection = request.app.state.collection2
    # Upsert so repeating the same user/topic/year search overwrites the old result.
    await collection.update_one(
        {"userId": userId, "topic": topic, "year": year},
        {"$set": document},
        upsert=True
    )
    print(f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}")


@router.post("/search/")
async def search(data: SearchRequest, request: Request):
    userId = data.userId
    topic = data.topic
    year = data.year

    keywords = extract_keywords(topic)
    refined_query = "+".join(keywords)
    print(f"Using refined search query: {refined_query}")

    # Note: fetch_works() issues blocking HTTP requests, which will stall the
    # event loop for the duration of the OpenAlex pagination.
    works = fetch_works(refined_query, year)
    if not works:
        raise HTTPException(status_code=404, detail="No works found for the given query")

    metadata = extract_metadata(works)
    cleaned_metadata = clean_metadata(metadata)
    await save_to_mongodb(userId, topic, year, cleaned_metadata, request)

    return {"message": f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}", "metadata": cleaned_metadata}


@router.post("/check-data-exists-venue/")
async def check_data_exists(request_data: SearchRequest, request: Request):
    query = {
        "userId": request_data.userId,
        "topic": request_data.topic
    }
    # year is a required int in SearchRequest, so this only skips a zero value.
    if request_data.year:
        query["year"] = request_data.year

    collection = request.app.state.collection2
    document = await collection.find_one(query)

    return {
        "exists": document is not None,
        "message": "Data found" if document else "Data not found"
    }
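
# Illustrative request/response (values are made up):
#     POST /check-data-exists-venue/
#     {"topic": "machine learning in healthcare", "year": 2023, "userId": "u123"}
#     -> {"exists": true, "message": "Data found"}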