Spaces:

youdie006
/

simsimi_ai_agent

Runtime error

File size: 49,080 Bytes

065853d

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RPUwOvgUyZiz"
      },
      "source": [
        "requiremenrs.txt\n",
        "\n",
        "langchain\n",
        "langchain-openai\n",
        "langchainhub # langchain python라이브러리로 프롬프트, 에이전트, 체인 관련 패키지 모음\n",
        "langserve[all]\n",
        "\n",
        "faiss-cpu  # Facebook에서 개발 및 배포한 밀집 벡터의 유사도 측정, 클러스터링에 효율적인 라이브러리\n",
        "tavily-python # 언어 모델에 중립적인 디자인으로, 모든 LLM과 통합이 가능하도록 설계된 검색 API\n",
        "beautifulsoup4  #파이썬에서 사용할 수 있는 웹데이터 크롤링 라이브러리\n",
        "wikipedia\n",
        "\n",
        "fastapi #  Python의 API를 빌드하기 위한 웹 프레임워크\n",
        "uvicorn # ASGI(Asynchronous Server Gateway Interface) 서버\n",
        "urllib3 # 파이썬에서 HTTP 요청을 보내고 받는 데 사용되는 강력하고 유연한 라이브러리\n",
        "\n",
        "python-dotenv\n",
        "pypdf"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NMMJXo_JyjhQ"
      },
      "outputs": [],
      "source": [
        "!pip install langchain\n",
        "!pip install langchain-openai\n",
        "!pip install python-dotenv\n",
        "!pip install langchain_community\n",
        "!pip install pypdf\n",
        "!pip install faiss-cpu\n",
        "!pip install wikipedia\n",
        "!pip install openai"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jXEvb3WyJMcA"
      },
      "source": [
        "Tavily Search 를 사용하기 위해서는 API KEY를 발급 받아 등록해야 함.\n",
        "\n",
        "[Tavily Search API 발급받기](https://app.tavily.com/sign-in)\n",
        "\n",
        "발급 받은 API KEY 를 다음과 같이 환경변수에 등록"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RIxxUDEZI6ZR"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "# TAVILY API KEY를 기입합니다.\n",
        "os.environ[\"TAVILY_API_KEY\"] = \"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\"\n",
        "\n",
        "# 디버깅을 위한 프로젝트명을 기입합니다.\n",
        "os.environ[\"LANGCHAIN_PROJECT\"] = \"AGENT TUTORIAL\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ys24Z3bfJHUf"
      },
      "outputs": [],
      "source": [
        "os.environ[\"OPENAI_API_KEY\"] = ''"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sEii2SHNJbAG"
      },
      "outputs": [],
      "source": [
        "# API KEY를 환경변수로 관리하기 위한 설정 파일\n",
        "from dotenv import load_dotenv\n",
        "\n",
        "# API KEY 정보로드\n",
        "load_dotenv()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ezbT1NHQKP12"
      },
      "outputs": [],
      "source": [
        "#google drive load\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rKz2oCpl6lWK"
      },
      "outputs": [],
      "source": [
        "# training_dir_path = '/content/drive/MyDrive/2025_Bigdata_nlp_class/aihub_dataset/Training/02_label_data'\n",
        "# validation_dir_path = '/content/drive/MyDrive/2025_Bigdata_nlp_class/aihub_dataset/Validation/02_label_data'"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sWR_5-ANKJt-"
      },
      "source": [
        "search.invoke 함수는 주어진 문자열에 대한 검색을 실행\n",
        "\n",
        "invoke() 함수에 검색하고 싶은 검색어를 넣어 검색을 수행"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5mHSqB3_Kvf3"
      },
      "source": [
        "#PDF 기반 문서 검색 도구: Retriever\n",
        "\n",
        "내부 데이터에 대해 조회를 수행할 retriever 생성.\n",
        "\n",
        "*   웹 기반 문서 로더, 문서 분할기, 벡터 저장소, 그리고 OpenAI 임베딩을 사용하여 문서 검색 시스템을 구축\n",
        "*   PDF 문서를 FAISS DB 에 저장하고 조회하는 retriever 를 생성\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IdP3zsje84fq"
      },
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hnw_piOXK40_"
      },
      "outputs": [],
      "source": [
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "from langchain_community.vectorstores import FAISS\n",
        "from langchain_openai import OpenAIEmbeddings\n",
        "from langchain.document_loaders import PyPDFLoader\n",
        "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
        "import json\n",
        "from langchain.document_loaders import TextLoader\n",
        "from langchain.schema import Document\n",
        "import unicodedata # Import the unicodedata module\n",
        "\n",
        "# PDF 파일 로드. 파일의 경로 입력\n",
        "# 경로 설정\n",
        "training_dir_path = \"/content/drive/MyDrive/2025_Bigdata_nlp_class/aihub_dataset/Training/02_label_data\"\n",
        "\n",
        "# for f in os.listdir(training_dir_path):\n",
        "#     print(repr(f))  # unicode escape로 특수문자 확인\n",
        "\n",
        "documents = []\n",
        "# === 1. JSON 파일 로드 + 파일명 정규화 → 메타데이터 추출 ===\n",
        "def load_documents_with_metadata(folder_path):\n",
        "    for raw_filename in os.listdir(folder_path):\n",
        "        # 파일 시스템에서 읽은 원래 이름을 정규화(NFC) 처리\n",
        "        filename = unicodedata.normalize(\"NFC\", raw_filename)\n",
        "        file_path = os.path.join(folder_path, raw_filename)  # 실제 파일 경로는 raw_filename 써야 합니다.\n",
        "        print(f\"▶ 처리 중 파일 (원본): {filename}\")\n",
        "\n",
        "        # 실제 파일인지 확인 (폴더나 시스템 파일 스킵)\n",
        "        if not os.path.isfile(file_path):\n",
        "            continue\n",
        "        # .json 확장자가 아닌 파일 스킵\n",
        "        if not filename.endswith(\".json\"):\n",
        "            continue\n",
        "\n",
        "        try:\n",
        "            # 정규화된 파일명 출력 (한글 조합형 → 완성형으로 변환됐는지 확인)\n",
        "            print(f\"▶ 처리 중 파일 (정규화): {filename}\")\n",
        "\n",
        "            # 정규화된 파일명을 \"_\"로 분리\n",
        "            # 예시: Empathy_기쁨_부모자녀_조손_343.json\n",
        "            parts = filename.replace(\".json\", \"\").split(\"_\")\n",
        "\n",
        "            # parts[1] = 감정, parts[2] = 관계  (예: \"기쁨\", \"부모자녀\")\n",
        "            if len(parts) >= 3:\n",
        "                emotion = parts[1]\n",
        "                relation = parts[2]\n",
        "            else:\n",
        "                emotion = \"unknown\"\n",
        "                relation = \"unknown\"\n",
        "\n",
        "            # JSON 읽기\n",
        "            with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
        "                data = json.load(f)\n",
        "                utterances = data.get(\"utterances\", [])\n",
        "\n",
        "                # 대화 utterance만 합쳐서 하나의 긴 텍스트로 만듦\n",
        "                full_text = \"\\n\".join([utt.get(\"text\", \"\") for utt in utterances])\n",
        "\n",
        "                # 텍스트가 비어 있으면 스킵\n",
        "                if full_text.strip() == \"\":\n",
        "                    print(f\"  ⚠️ 내용 비어 있음 → 스킵: {filename}\")\n",
        "                    continue\n",
        "\n",
        "                # Document 생성\n",
        "                doc = Document(\n",
        "                    page_content=full_text,\n",
        "                    metadata={\n",
        "                        \"filename\": filename,\n",
        "                        \"emotion\": emotion,\n",
        "                        \"relation\": relation\n",
        "                    }\n",
        "                )\n",
        "                documents.append(doc)\n",
        "\n",
        "        except Exception as e:\n",
        "            print(f\"❌ 오류 발생 ({filename}): {e}\")\n",
        "\n",
        "    return documents\n",
        "\n",
        "\n",
        "documents = load_documents_with_metadata(training_dir_path)\n",
        "print(f\"✅ 로드된 원본 문서 수: {len(documents)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d_1DPQehD6o6"
      },
      "outputs": [],
      "source": [
        "# === 2. 문서 분할 함수 ===\n",
        "def split_documents(documents):\n",
        "    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)\n",
        "    return splitter.split_documents(documents)\n",
        "\n",
        "\n",
        "# === 3. FAISS DB 생성 ===\n",
        "def create_faiss_db(docs):\n",
        "    embeddings = OpenAIEmbeddings()\n",
        "    vectorstore = FAISS.from_documents(docs, embeddings)\n",
        "    return vectorstore\n",
        "\n",
        "def filtered_similarity_search(vectorstore, query, emotion=None, relation=None, k=3):\n",
        "    # docstore 내 모든 Document 객체 가져오기\n",
        "    all_docs = list(vectorstore.docstore._dict.values())\n",
        "\n",
        "    # 1차 감정(emotion), 2차 관계(relation) 필터링\n",
        "    filtered_docs = []\n",
        "    for doc in all_docs:\n",
        "        doc_em = doc.metadata.get(\"emotion\", \"\")\n",
        "        doc_rel = doc.metadata.get(\"relation\", \"\")\n",
        "        if emotion and doc_em != emotion:\n",
        "            continue\n",
        "        if relation and doc_rel != relation:\n",
        "            continue\n",
        "        filtered_docs.append(doc)\n",
        "\n",
        "    if not filtered_docs:\n",
        "        print(\"❗ 해당 감정/관계 조건의 문서가 없습니다.\")\n",
        "        return []\n",
        "\n",
        "    # 필터링된 문서를 임베딩해서 별도의 FAISS 인덱스로 만들 수도 있지만,\n",
        "    # 여기서는 간단히 vectorstore.similarity_search() 호출 → 결과에서 필터 적용\n",
        "    # (단, 필요하다면 filtered_docs만으로 새로운 FAISS 인덱스를 생성 후 검색할 수도 있습니다.)\n",
        "    results = vectorstore.similarity_search(query, k=k)\n",
        "\n",
        "    # 검색 결과 중에서도 meta 필터(감정/관계)가 맞는 것만 리턴\n",
        "    return [doc for doc in results\n",
        "            if (not emotion or doc.metadata.get(\"emotion\") == emotion)\n",
        "            and (not relation or doc.metadata.get(\"relation\") == relation)]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_v_45KX6GKQ_"
      },
      "outputs": [],
      "source": [
        "# 2) 문서 분할 (chunking)\n",
        "split_docs = split_documents(documents)\n",
        "print(f\"✅ 분할된 문서 청크 개수: {len(split_docs)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wAIBhXq4H68s"
      },
      "outputs": [],
      "source": [
        "# 예시: split_docs 리스트에서 앞의 10개 문서만 확인하는 코드\n",
        "\n",
        "# (이전 단계에서 이미 split_docs를 생성했다고 가정)\n",
        "# split_docs = split_documents(documents)\n",
        "\n",
        "# 앞 10개 문서만 출력\n",
        "for idx, doc in enumerate(split_docs[:5], start=1):\n",
        "    print(f\"--- 문서 #{idx} ---\")\n",
        "    print(f\"파일명    : {doc.metadata.get('filename')}\")\n",
        "    print(f\"감정      : {doc.metadata.get('emotion')}\")\n",
        "    print(f\"관계      : {doc.metadata.get('relation')}\")\n",
        "    print(\"내용 (일부) :\")\n",
        "    print(doc.page_content[:200].replace(\"\\n\", \" \") + \"...\\n\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "98_fnjppGVhc"
      },
      "outputs": [],
      "source": [
        "# 3) FAISS DB 생성\n",
        "if not split_docs:\n",
        "  raise RuntimeError(\"❌ 분할된 문서가 없어서 FAISS 생성이 불가능합니다.\")\n",
        "faiss_db = create_faiss_db(split_docs)\n",
        "print(\"✅ FAISS DB 생성 완료\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rEtEXpKTRQ2_"
      },
      "outputs": [],
      "source": [
        "# 저장/로딩할 FAISS 인덱스 폴더 경로\n",
        "index_dir = \"/content/drive/MyDrive/2025_Bigdata_nlp_class/faiss_index\"\n",
        "\n",
        "# --- (1) 이미 저장된 인덱스가 있는지 확인 ---\n",
        "if os.path.isdir(index_dir) and \\\n",
        "   os.path.exists(os.path.join(index_dir, \"index.faiss\")):\n",
        "    # 저장된 인덱스가 있으면 로드\n",
        "    embeddings = OpenAIEmbeddings()\n",
        "    # allow_dangerous_deserialization=True 를 추가하여 로딩을 허용합니다.\n",
        "    faiss_db = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)\n",
        "    print(\"✅ 기존 FAISS 인덱스를 불러왔습니다:\", index_dir)\n",
        "\n",
        "else:\n",
        "    # 저장된 인덱스가 없으면 새로 생성\n",
        "    # ① 여기에 split_docs를 미리 생성하는 코드를 넣으세요\n",
        "    #    예시: split_docs = split_documents(documents)\n",
        "    #\n",
        "    # ② create_faiss_db 함수나 직접 임베딩 + 저장 로직을 호출\n",
        "    #\n",
        "    # 예시:\n",
        "    # split_docs = split_documents(documents)\n",
        "    # embeddings = OpenAIEmbeddings()\n",
        "    # faiss_db = FAISS.from_documents(split_docs, embeddings)\n",
        "    #\n",
        "    # 실제 프로젝트에 맞게 아래 두 줄을 수정하세요:\n",
        "    faiss_db = create_faiss_db(split_docs)\n",
        "    embeddings = OpenAIEmbeddings()\n",
        "\n",
        "    # --- (2) 생성된 FAISS를 로컬에 저장 ---\n",
        "    os.makedirs(index_dir, exist_ok=True)\n",
        "    faiss_db.save_local(index_dir)\n",
        "    print(\"✅ 새로 FAISS 인덱스를 생성하고 저장했습니다:\", index_dir)\n",
        "\n",
        "# 이후 faiss_db를 retriever로 사용 가능합니다.\n",
        "# 예시:\n",
        "retriever = faiss_db.as_retriever()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0w4Xzp6FJzD1"
      },
      "outputs": [],
      "source": [
        "# === 4. 사용자 검색 함수 (감정 + 관계 필터) ===\n",
        "def filtered_similarity_search(vectorstore, query, emotion=None, relation=None, k=3):\n",
        "    # 필터링\n",
        "    all_docs = vectorstore.docstore._dict.values()\n",
        "    filtered_docs = [\n",
        "        doc for doc in all_docs\n",
        "        if (emotion is None or doc.metadata.get(\"emotion\") == emotion)\n",
        "        and (relation is None or relation in doc.metadata.get(\"relation\"))\n",
        "    ]\n",
        "\n",
        "    if not filtered_docs:\n",
        "        print(\"❗해당 조건의 문서가 없습니다.\")\n",
        "        return []\n",
        "\n",
        "    # 유사도 기반 검색\n",
        "    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
        "    query_chunks = splitter.split_text(query)\n",
        "    search_results = []\n",
        "    for chunk in query_chunks:\n",
        "        search_results.extend(vectorstore.similarity_search(chunk, k=k))\n",
        "    return search_results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wE5IY8l5KH0T"
      },
      "outputs": [],
      "source": [
        "# === 예시 검색 ===\n",
        "query = \"아기를 키우는 게 너무 힘들어요. 조언이 필요해요.\"\n",
        "results = filtered_similarity_search(faiss_db, query, emotion=\"기쁨\", relation=\"부모자녀\")\n",
        "\n",
        "for i, doc in enumerate(results):\n",
        "    print(f\"\\n✅ 검색 결과 {i+1}\")\n",
        "    print(f\"파일명: {doc.metadata['filename']}\")\n",
        "    print(f\"감정: {doc.metadata['emotion']} / 관계: {doc.metadata['relation']}\")\n",
        "    print(doc.page_content[:300] + \"...\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ay3lQq-RK_AE"
      },
      "outputs": [],
      "source": [
        "import openai\n",
        "\n",
        "# OpenAI API 키 설정\n",
        "class config:\n",
        "    OPENAI_API_KEY = \"\"\n",
        "\n",
        "openai.api_key = config.OPENAI_API_KEY\n",
        "\n",
        "# ─── 2) GPT-4o(또는 GPT-4o-mini)를 사용해 “가장 적절한” 문서를 고르는 함수 ───────────────────\n",
        "def choose_best_doc_with_gpt(query, docs, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"\n",
        "    query: 사용자의 원래 질문\n",
        "    docs: filtered_similarity_search에서 반환된 Document 객체 리스트\n",
        "    model: \"gpt-4o\" 또는 \"gpt-4o-mini\"\n",
        "    반환: (best_doc, gpt_explanation)\n",
        "    \"\"\"\n",
        "    # (A) 프롬프트 구성\n",
        "    prompt_parts = [\n",
        "        \"당신은 대화 응답 후보를 평가하는 전문가입니다.\\n\",\n",
        "        f\"사용자 질문: \\\"{query}\\\"\\n\",\n",
        "        \"다음은 검색된 응답 후보들입니다.\\n\"\n",
        "    ]\n",
        "\n",
        "    for idx, doc in enumerate(docs, start=1):\n",
        "        snippet = doc.page_content.strip().replace(\"\\n\", \" \")\n",
        "        if len(snippet) > 300:\n",
        "            snippet = snippet[:300] + \"...\"\n",
        "        prompt_parts.append(\n",
        "            f\"[{idx}]\\n\"\n",
        "            f\"Filename: {doc.metadata.get('filename')}\\n\"\n",
        "            f\"Emotion: {doc.metadata.get('emotion')}, Relation: {doc.metadata.get('relation')}\\n\"\n",
        "            f\"Content: \\\"{snippet}\\\"\\n\"\n",
        "        )\n",
        "\n",
        "    prompt_parts.append(\n",
        "        \"\\n위 후보들 중에서, 사용자 질문에 가장 적절한 응답을 하나 선택하고, 그 이유를 간단히 설명해주세요.\\n\"\n",
        "        \"반드시 다음 형식으로 응답해야 합니다:\\n\"\n",
        "        \"선택: [번호]\\n\"\n",
        "        \"이유: [이유]\\n\"\n",
        "    )\n",
        "\n",
        "    full_prompt = \"\\n\".join(prompt_parts)\n",
        "\n",
        "    # (B) GPT-4o 호출\n",
        "    response = openai.chat.completions.create(\n",
        "        model=model,\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": \"당신은 뛰어난 대화 평가자입니다.\"},\n",
        "            {\"role\": \"user\", \"content\": full_prompt}\n",
        "        ],\n",
        "        max_tokens=300,\n",
        "        temperature=0.0\n",
        "    )\n",
        "\n",
        "    gpt_reply = response.choices[0].message.content.strip()\n",
        "\n",
        "    # (C) GPT가 반환한 '선택: [번호]' 파싱\n",
        "    selected_idx = None\n",
        "    for line in gpt_reply.splitlines():\n",
        "        if line.strip().startswith(\"선택\"):\n",
        "            import re\n",
        "            m = re.search(r\"\\[(\\d+)\\]\", line)\n",
        "            if m:\n",
        "                selected_idx = int(m.group(1))\n",
        "                break\n",
        "\n",
        "    # 파싱 실패 시 기본 1번 선택\n",
        "    if selected_idx is None or selected_idx < 1 or selected_idx > len(docs):\n",
        "        selected_idx = 1\n",
        "\n",
        "    best_doc = docs[selected_idx - 1]\n",
        "    return best_doc, gpt_reply\n",
        "\n",
        "\n",
        "# ─── 3) 예시 검색 + GPT-4o 최종 선택 ───────────────────────────────────────────────\n",
        "if __name__ == \"__main__\":\n",
        "    # (가정) faiss_db는 이미 생성되어 로드된 FAISS 인덱스 객체입니다.\n",
        "    # 예: faiss_db = FAISS.load_local(index_dir, OpenAIEmbeddings())\n",
        "\n",
        "    # ① 조회할 사용자 질의\n",
        "    query = \"아기를 키우는 게 너무 힘들어요. 조언이 필요해요.\"\n",
        "\n",
        "    # ② 기존 검색 함수 그대로 사용\n",
        "    results = filtered_similarity_search(faiss_db, query, emotion=\"기쁨\", relation=\"부모자녀\")\n",
        "\n",
        "    # ③ 검색 결과 출력\n",
        "    for i, doc in enumerate(results, start=1):\n",
        "        print(f\"\\n✅ 검색 결과 {i}\")\n",
        "        print(f\"파일명: {doc.metadata['filename']}\")\n",
        "        print(f\"감정: {doc.metadata['emotion']} / 관계: {doc.metadata['relation']}\")\n",
        "        print(doc.page_content[:300] + \"...\\n\")\n",
        "\n",
        "    # ④ 검색 결과가 있으면, GPT-4o로 \"가장 적절한\" 문서 선택\n",
        "    if results:\n",
        "        best_doc, explanation = choose_best_doc_with_gpt(query, results, model=\"gpt-4o-mini\")\n",
        "\n",
        "        print(\"\\n\\n=== GPT-4o가 선택한 최종 응답 ===\")\n",
        "        print(f\"■ 선택된 파일명: {best_doc.metadata['filename']}\")\n",
        "        print(f\"■ 선택 이유:\\n{explanation}\\n\")\n",
        "        print(f\"■ 최종 응답 내용:\\n{best_doc.page_content}\")\n",
        "    else:\n",
        "        print(\"❗ 검색 결과가 없어 GPT 평가를 진행할 수 없습니다.\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7hNWJkaTSOC9"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import openai\n",
        "import unicodedata\n",
        "from langchain.schema import Document\n",
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.vectorstores import FAISS\n",
        "\n",
        "# ─── 0) OpenAI API 키 설정 ─────────────────────────────────────────────────\n",
        "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
        "\n",
        "\n",
        "# ─── 1) 사용자 검색 함수 그대로 ────────────────────────────────────────────────\n",
        "def filtered_similarity_search(vectorstore, query, emotion=None, relation=None, k=3):\n",
        "    all_docs = vectorstore.docstore._dict.values()\n",
        "    filtered_docs = [\n",
        "        doc for doc in all_docs\n",
        "        if (emotion is None or doc.metadata.get(\"emotion\") == emotion)\n",
        "        and (relation is None or relation in doc.metadata.get(\"relation\"))\n",
        "    ]\n",
        "\n",
        "    if not filtered_docs:\n",
        "        print(\"❗ 해당 조건의 문서가 없습니다.\")\n",
        "        return []\n",
        "\n",
        "    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
        "    query_chunks = splitter.split_text(query)\n",
        "\n",
        "    search_results = []\n",
        "    for chunk in query_chunks:\n",
        "        search_results.extend(vectorstore.similarity_search(chunk, k=k))\n",
        "    return search_results\n",
        "\n",
        "\n",
        "# ─── 2) GPT-4o로 “최적 후보 선택 + 이유 설명” 함수 (v1 API) ───────────────────────────\n",
        "def choose_best_doc_with_gpt(query, docs, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"\n",
        "    query: 사용자의 원래 질문\n",
        "    docs: filtered_similarity_search에서 반환된 Document 리스트\n",
        "    model: \"gpt-4o\" 또는 \"gpt-4o-mini\"\n",
        "    반환: (best_doc, gpt_reason)\n",
        "      - best_doc: GPT가 선택한 Document 객체\n",
        "      - gpt_reason: \"선택: [번호]\\n이유: ...\" 형태의 문자열\n",
        "    \"\"\"\n",
        "    prompt_parts = [\n",
        "        \"당신은 후보 응답을 평가하는 전문가입니다.\\n\",\n",
        "        f\"사용자 질문: \\\"{query}\\\"\\n\",\n",
        "        \"다음은 검색된 응답 후보들입니다.\\n\"\n",
        "    ]\n",
        "\n",
        "    for idx, doc in enumerate(docs, start=1):\n",
        "        snippet = doc.page_content.strip().replace(\"\\n\", \" \")\n",
        "        if len(snippet) > 300:\n",
        "            snippet = snippet[:300] + \"...\"\n",
        "        prompt_parts.append(\n",
        "            f\"[{idx}]\\n\"\n",
        "            f\"Filename: {doc.metadata.get('filename')}\\n\"\n",
        "            f\"Emotion: {doc.metadata.get('emotion')}, Relation: {doc.metadata.get('relation')}\\n\"\n",
        "            f\"Content: \\\"{snippet}\\\"\\n\"\n",
        "        )\n",
        "\n",
        "    prompt_parts.append(\n",
        "        \"\\n위 후보들 중에서, 사용자 질문에 가장 적절한 응답을 한 개만 선택하고, 그 이유를 간단히 설명해주세요.\\n\"\n",
        "        \"반드시 다음 형식으로 응답해 주세요:\\n\"\n",
        "        \"선택: [번호]\\n\"\n",
        "        \"이유: [간단한 설명]\\n\"\n",
        "    )\n",
        "\n",
        "    full_prompt = \"\\n\".join(prompt_parts)\n",
        "\n",
        "    # ── OpenAI Chat Completions 호출 ─────────────────────────────────────────────\n",
        "    response = openai.chat.completions.create(\n",
        "        model=model,\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": \"당신은 뛰어난 대화 평가자입니다.\"},\n",
        "            {\"role\": \"user\", \"content\": full_prompt}\n",
        "        ],\n",
        "        max_tokens=300,\n",
        "        temperature=0.0\n",
        "    )\n",
        "\n",
        "    gpt_reply = response.choices[0].message.content.strip()\n",
        "\n",
        "    # “선택: [번호]” 파싱\n",
        "    selected_idx = None\n",
        "    for line in gpt_reply.splitlines():\n",
        "        if line.strip().startswith(\"선택\"):\n",
        "            import re\n",
        "            m = re.search(r\"\\[(\\d+)\\]\", line)\n",
        "            if m:\n",
        "                selected_idx = int(m.group(1))\n",
        "                break\n",
        "\n",
        "    # 파싱 실패 시 기본 1번\n",
        "    if selected_idx is None or selected_idx < 1 or selected_idx > len(docs):\n",
        "        selected_idx = 1\n",
        "\n",
        "    best_doc = docs[selected_idx - 1]\n",
        "    return best_doc, gpt_reply\n",
        "\n",
        "\n",
        "# ─── 3) “선택된 후보를 간결하게 재작성” 함수 ─────────────────────────────────────\n",
        "def generate_final_answer(query, best_doc, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"\n",
        "    query: 사용자의 원래 질문\n",
        "    best_doc: choose_best_doc_with_gpt가 반환한 Document 객체\n",
        "    model: \"gpt-4o\" 또는 \"gpt-4o-mini\"\n",
        "    반환: GPT가 생성한 최종 사용자용 응답(불필요한 부분 제거된 형태)\n",
        "    \"\"\"\n",
        "    # (A) 프롬프트 구성: “최종 응답 후보”를 직접 재작성하도록 요청\n",
        "    prompt = (\n",
        "        \"다음은 사용자의 질문과, 선택된 최적 응답 후보입니다.\\n\\n\"\n",
        "        f\"사용자 질문: \\\"{query}\\\"\\n\"\n",
        "        \"선택된 후보 응답 내용(원문):\\n\"\n",
        "        f\"\\\"\\\"\\\"\\n{best_doc.page_content}\\n\\\"\\\"\\\"\\n\\n\"\n",
        "        \"위 원문에서, 불필요한 반복/인사말/개인정보 등은 모두 제거하고, \"\n",
        "        \"사용자가 이해하기 쉽도록 핵심만 남겨 간결하게 재작성해주세요.\\n\"\n",
        "        \"문체는 친절하고 공감 가득한 톤을 유지해 주시고, \"\n",
        "        \"최종 답변만 출력해 주세요.\"\n",
        "    )\n",
        "\n",
        "    # (B) GPT-4o 호출\n",
        "    response = openai.chat.completions.create(\n",
        "        model=model,\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": \"당신은 친절하고 공감능력이 뛰어난 상담사입니다.\"},\n",
        "            {\"role\": \"user\", \"content\": prompt}\n",
        "        ],\n",
        "        max_tokens=300,\n",
        "        temperature=0.7\n",
        "    )\n",
        "\n",
        "    final_answer = response.choices[0].message.content.strip()\n",
        "    return final_answer\n",
        "\n",
        "\n",
        "# ─── 4) 전체 흐름 예시 ────────────────────────────────────────────────────────\n",
        "if __name__ == \"__main__\":\n",
        "    # (가정) faiss_db는 이미 생성/로드된 FAISS 인덱스 객체입니다.\n",
        "    # 예: faiss_db = FAISS.load_local(index_dir, OpenAIEmbeddings())\n",
        "\n",
        "    # ① 사용자 질의\n",
        "    query = \"아기를 키우는 게 너무 힘들어요. 조언이 필요해요.\"\n",
        "\n",
        "    # ② 기존 검색 함수 그대로 사용\n",
        "    results = filtered_similarity_search(faiss_db, query, emotion=\"기쁨\", relation=\"부모자녀\")\n",
        "\n",
        "    # ③ 검색 결과 출력 (원본 후보 3개)\n",
        "    for i, doc in enumerate(results, start=1):\n",
        "        print(f\"\\n✅ 검색 결과 {i}\")\n",
        "        print(f\"파일명: {doc.metadata['filename']}\")\n",
        "        print(f\"감정: {doc.metadata['emotion']} / 관계: {doc.metadata['relation']}\")\n",
        "        print(doc.page_content[:300] + \"...\\n\")\n",
        "\n",
        "    # ④ 검색 결과가 있으면, GPT-4o로 “가장 적절한” 문서 선택 + 이유 얻기\n",
        "    if results:\n",
        "        best_doc, explanation = choose_best_doc_with_gpt(query, results, model=\"gpt-4o-mini\")\n",
        "        print(\"\\n\\n=== GPT-4o가 선택한 최종 응답 후보 ===\")\n",
        "        print(f\"■ 선택된 파일명: {best_doc.metadata['filename']}\")\n",
        "        print(f\"■ 선택 이유:\\n{explanation}\\n\")\n",
        "\n",
        "        # ⑤ 선택된 후보를 재작성하여 최종 답변 생성\n",
        "        cleaned_answer = generate_final_answer(query, best_doc, model=\"gpt-4o-mini\")\n",
        "        print(\"=== 최종 사용자 응답 (불필요한 내용 제거됨) ===\")\n",
        "        print(cleaned_answer)\n",
        "\n",
        "    else:\n",
        "        print(\"❗ 검색 결과가 없어 GPT 평가를 진행할 수 없습니다.\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Pk55HkBQSW_s"
      },
      "outputs": [],
      "source": [
        "# ─── 예시: 여러 개의 query를 한 번에 처리해 보는 코드 ─────────────────────\n",
        "\n",
        "# (가정) 이미 아래 함수들이 정의되어 있고, faiss_db도 생성/로드되어 있다고 봅니다.\n",
        "# - filtered_similarity_search(vectorstore, query, emotion, relation)\n",
        "# - choose_best_doc_with_gpt(query, docs, model)\n",
        "# - generate_final_answer(query, best_doc, model)\n",
        "\n",
        "# 1) 테스트할 질의 리스트를 정의\n",
        "queries = [\n",
        "    \"아기를 키우는 일을 시작하려는데, 어떻게 준비해야 할까요?\",\n",
        "    \"아이가 자꾸 밤에 깨서 낮잠도 잘 못 자요. 어떻게 도와줄 수 있을까요?\",\n",
        "    \"육아 스트레스를 푸는 방법이 있을까요?\",\n",
        "    \"첫 돌 지난 아기가 말을 잘 안 들을 때 어떻게 해야 하나요?\",\n",
        "    \"아기가 갑자기 울음을 멈추지 않아서 당황스러워요. 조언 부탁드려요.\"\n",
        "]\n",
        "\n",
        "# 2) 감정(emotion)과 관계(relation) 예시는 고정해도 되고,\n",
        "#    아니면 query별로 달리 지정해도 됩니다. 여기서는 예시로 전부 \"기쁨\"/\"부모자녀\"로 가정.\n",
        "emotion = \"기쁨\"\n",
        "relation = \"부모자녀\"\n",
        "\n",
        "# 3) 각 query 순회하면서 단계별로 결과 출력\n",
        "for idx, q in enumerate(queries, start=1):\n",
        "    print(f\"\\n\\n========== Query #{idx} ==========\")\n",
        "    print(f\"사용자 질문: {q}\\n\")\n",
        "\n",
        "    # 3-1) 감정/관계 필터 + FAISS 유사도 검색 → 후보 3개 가져오기\n",
        "    candidates = filtered_similarity_search(faiss_db, q, emotion=emotion, relation=relation)\n",
        "    if not candidates:\n",
        "        print(\"❗ 조건에 맞는 문서가 없습니다. 다음 질의로 넘어갑니다.\")\n",
        "        continue\n",
        "\n",
        "    # 3-2) 후보 원문 간단 출력\n",
        "    print(\"■ 검색된 후보 (최대 3개):\")\n",
        "    for i, doc in enumerate(candidates, start=1):\n",
        "        print(f\"\\n[후보 {i}] 파일명: {doc.metadata['filename']}\")\n",
        "        print(f\"감정: {doc.metadata['emotion']}, 관계: {doc.metadata['relation']}\")\n",
        "        print(doc.page_content[:200].replace(\"\\n\", \" \") + \"...\\n\")\n",
        "\n",
        "    # 3-3) GPT-4o에게 “가장 적절한 후보 선택 + 이유” 요청\n",
        "    best_doc, choice_reason = choose_best_doc_with_gpt(q, candidates, model=\"gpt-4o-mini\")\n",
        "    print(\"\\n■ GPT-4o가 선택한 후보:\")\n",
        "    print(f\"  • 선택된 파일명: {best_doc.metadata['filename']}\")\n",
        "    print(f\"  • 선택 이유:\\n{choice_reason}\\n\")\n",
        "\n",
        "    # 3-4) 선택된 후보를 다듬어서 최종 답변 생성\n",
        "    final_answer = generate_final_answer(q, best_doc, model=\"gpt-4o-mini\")\n",
        "    print(\"■ 최종 사용자 응답 (정제된 텍스트):\")\n",
        "    print(final_answer)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UJAUt5feYme_"
      },
      "outputs": [],
      "source": [
        "!pip install gradio"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DDPtd5xgYxfn"
      },
      "outputs": [],
      "source": [
        "import gradio as gr\n",
        "import os\n",
        "import openai\n",
        "import unicodedata\n",
        "import json\n",
        "from langchain.schema import Document\n",
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.vectorstores import FAISS\n",
        "\n",
        "# ─── 0) OpenAI API key setup ───────────────────────────────────────────────\n",
        "# The key is read from the OPENAI_API_KEY environment variable; os.getenv returns None\n",
        "# when the variable is unset, in which case openai.api_key is assigned None here.\n",
        "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
        "\n",
        "# ─── 1) JSON 로드 + 메타데이터 추출 함수 ─────────────────────────────────────────\n",
        "def load_documents_with_metadata(folder_path):\n",
        "    \"\"\"Load every ``*.json`` file directly inside ``folder_path`` as a LangChain ``Document``.\n",
        "\n",
        "    Emotion and relation labels come from the file name: the ``.json`` suffix is removed,\n",
        "    the rest is split on ``_``, and the second and third fields are used (``\"unknown\"``\n",
        "    when a field is missing). The page content is the ``\"text\"`` of every entry under the\n",
        "    top-level ``\"utterances\"`` key, joined with newlines.\n",
        "\n",
        "    Args:\n",
        "        folder_path: Directory containing the JSON files (sub-directories are not visited).\n",
        "\n",
        "    Returns:\n",
        "        list[Document]: One document per readable file with non-blank text, in sorted\n",
        "        file-name order. Files that fail to load are reported with print() and skipped.\n",
        "    \"\"\"\n",
        "    suffix = \".json\"\n",
        "    documents = []\n",
        "    # os.listdir returns entries in arbitrary order; sorting keeps the document order reproducible.\n",
        "    for raw_filename in sorted(os.listdir(folder_path)):\n",
        "        # NFC-normalise the name used for labels (presumably because Korean file names can be\n",
        "        # stored in decomposed form); the raw name is still used to open the file.\n",
        "        filename = unicodedata.normalize(\"NFC\", raw_filename)\n",
        "        file_path = os.path.join(folder_path, raw_filename)\n",
        "\n",
        "        if not os.path.isfile(file_path) or not filename.endswith(suffix):\n",
        "            continue\n",
        "\n",
        "        try:\n",
        "            # Strip only the trailing extension; str.replace would also remove \".json\" inside the stem.\n",
        "            parts = filename[:-len(suffix)].split(\"_\")\n",
        "            emotion = parts[1] if len(parts) > 1 else \"unknown\"\n",
        "            relation = parts[2] if len(parts) > 2 else \"unknown\"\n",
        "\n",
        "            with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
        "                data = json.load(f)\n",
        "\n",
        "            # `or` fallbacks tolerate JSON nulls instead of discarding the whole file.\n",
        "            utterances = data.get(\"utterances\") or []\n",
        "            full_text = \"\\n\".join((utt.get(\"text\") or \"\") for utt in utterances)\n",
        "            if full_text.strip() == \"\":\n",
        "                continue\n",
        "\n",
        "            documents.append(\n",
        "                Document(\n",
        "                    page_content=full_text,\n",
        "                    metadata={\"filename\": filename, \"emotion\": emotion, \"relation\": relation}\n",
        "                )\n",
        "            )\n",
        "        except Exception as e:\n",
        "            # Best effort: one malformed file must not abort the whole load.\n",
        "            print(f\"❌ 오류 발생 ({filename}): {e}\")\n",
        "\n",
        "    return documents\n",
        "\n",
        "# ─── 2) 문서 분할 함수 ─────────────────────────────────────────────────────\n",
        "def split_documents(documents, chunk_size=300, chunk_overlap=50):\n",
        "    \"\"\"Split documents into overlapping chunks with ``RecursiveCharacterTextSplitter``.\n",
        "\n",
        "    Args:\n",
        "        documents: Documents to split.\n",
        "        chunk_size: Value passed to the splitter as ``chunk_size`` (default 300).\n",
        "        chunk_overlap: Value passed to the splitter as ``chunk_overlap`` (default 50).\n",
        "\n",
        "    Returns:\n",
        "        The chunk documents returned by ``splitter.split_documents``.\n",
        "    \"\"\"\n",
        "    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
        "    return splitter.split_documents(documents)\n",
        "\n",
        "# ─── 3) FAISS 인덱스 생성 혹은 로드 함수 ───────────────────────────────────\n",
        "def create_or_load_faiss(index_dir, split_docs):\n",
        "    \"\"\"Return a FAISS vector store, loading the one saved in ``index_dir`` when it exists.\n",
        "\n",
        "    Args:\n",
        "        index_dir: Directory that holds (or will receive) the saved index files.\n",
        "        split_docs: Document chunks to embed when no saved index is found. They are ignored\n",
        "            when an existing index is loaded, so remove ``index_dir`` to rebuild the index\n",
        "            after the source data changes.\n",
        "\n",
        "    Returns:\n",
        "        The loaded or newly built FAISS vector store.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If no saved index exists and ``split_docs`` is empty.\n",
        "    \"\"\"\n",
        "    embeddings = OpenAIEmbeddings()\n",
        "    if os.path.isdir(index_dir) and os.path.exists(os.path.join(index_dir, \"index.faiss\")):\n",
        "        # SECURITY: load_local unpickles stored data, hence the explicit opt-in flag.\n",
        "        # Only point index_dir at a directory this notebook wrote itself.\n",
        "        faiss_db = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)\n",
        "        print(\"✅ 기존 FAISS 인덱스를 로드했습니다.\")\n",
        "    else:\n",
        "        if not split_docs:\n",
        "            # Fail with a clear message instead of an obscure error from inside FAISS.\n",
        "            raise ValueError(\n",
        "                f\"No saved FAISS index in {index_dir!r} and no documents were provided to build one.\"\n",
        "            )\n",
        "        faiss_db = FAISS.from_documents(split_docs, embeddings)\n",
        "        os.makedirs(index_dir, exist_ok=True)\n",
        "        faiss_db.save_local(index_dir)\n",
        "        print(\"✅ 새로운 FAISS 인덱스를 생성하고 저장했습니다.\")\n",
        "    return faiss_db\n",
        "\n",
        "# ─── 4) 필터 + 유사도 검색 함수 ────────────────────────────────────────────────\n",
        "def filtered_similarity_search(vectorstore, query, emotion=None, relation=None, k=3):\n",
        "    \"\"\"Return chunks similar to ``query`` whose metadata matches the emotion/relation filters.\n",
        "\n",
        "    Args:\n",
        "        vectorstore: LangChain FAISS vector store whose documents carry ``\"emotion\"`` and\n",
        "            ``\"relation\"`` metadata.\n",
        "        query: User query text. It is split into chunks of at most 1000 characters and each\n",
        "            chunk is searched separately.\n",
        "        emotion: Emotion label that must match exactly; None or a blank string disables it.\n",
        "        relation: Text that the relation label must contain; None or a blank string disables it.\n",
        "        k: Maximum number of hits requested per query chunk.\n",
        "\n",
        "    Returns:\n",
        "        list[Document]: Matching documents in search order with duplicates removed; an empty\n",
        "        list when no stored document satisfies the filters.\n",
        "    \"\"\"\n",
        "    # Blank strings mean \"no filter\", exactly like None.\n",
        "    emotion = (emotion or \"\").strip() or None\n",
        "    relation = (relation or \"\").strip() or None\n",
        "\n",
        "    def _matches(metadata):\n",
        "        if emotion is not None and metadata.get(\"emotion\") != emotion:\n",
        "            return False\n",
        "        # `or \"\"` avoids a TypeError when a document has no relation label.\n",
        "        if relation is not None and relation not in (metadata.get(\"relation\") or \"\"):\n",
        "            return False\n",
        "        return True\n",
        "\n",
        "    all_docs = list(vectorstore.docstore._dict.values())\n",
        "    # Pre-check so the search is skipped entirely when nothing can match.\n",
        "    if not any(_matches(doc.metadata) for doc in all_docs):\n",
        "        return []\n",
        "\n",
        "    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
        "    query_chunks = splitter.split_text(query)\n",
        "\n",
        "    search_results = []\n",
        "    seen = set()\n",
        "    for chunk in query_chunks:\n",
        "        # Hand the filter to the vector store so only matching chunks come back. fetch_k spans\n",
        "        # the whole index so that a very selective filter can still yield up to k hits.\n",
        "        hits = vectorstore.similarity_search(chunk, k=k, filter=_matches, fetch_k=len(all_docs))\n",
        "        for doc in hits:\n",
        "            # The same chunk can be returned for several query chunks; keep the first occurrence.\n",
        "            key = (doc.metadata.get(\"filename\"), doc.page_content)\n",
        "            if key not in seen:\n",
        "                seen.add(key)\n",
        "                search_results.append(doc)\n",
        "    return search_results\n",
        "\n",
        "# ─── 5) 후보 중 최고 문서 선택 함수 ─────────────────────────────────────────────\n",
        "def _parse_selected_index(gpt_reply, num_candidates):\n",
        "    \"\"\"Extract the 1-based candidate number from the model reply, defaulting to 1.\"\"\"\n",
        "    import re  # local import: the import list of this cell does not include re\n",
        "\n",
        "    # Take the first number that follows the word 선택, with or without brackets or markdown\n",
        "    # markers in between, so replies such as 선택: 2 are understood as well as 선택: [2].\n",
        "    m = re.search(r\"선택\\W*(\\d+)\", gpt_reply)\n",
        "    if m:\n",
        "        selected_idx = int(m.group(1))\n",
        "        if 1 <= selected_idx <= num_candidates:\n",
        "            return selected_idx\n",
        "    # Unparseable or out-of-range answer: fall back to the first candidate.\n",
        "    return 1\n",
        "\n",
        "\n",
        "def choose_best_doc_with_gpt(query, docs, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"Ask a chat model to pick the candidate document that best answers ``query``.\n",
        "\n",
        "    Args:\n",
        "        query: The user's question.\n",
        "        docs: Non-empty list of candidate documents; each needs ``page_content`` and a\n",
        "            ``metadata`` dict (filename / emotion / relation are shown to the model).\n",
        "        model: Chat model name passed to ``openai.chat.completions.create``.\n",
        "\n",
        "    Returns:\n",
        "        tuple: ``(best_doc, gpt_reply)`` where ``best_doc`` is the chosen element of ``docs``\n",
        "        (the first one when the reply cannot be parsed) and ``gpt_reply`` is the stripped\n",
        "        reply text of the model.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If ``docs`` is empty.\n",
        "    \"\"\"\n",
        "    if not docs:\n",
        "        # Fail before spending an API call; indexing an empty list would fail afterwards anyway.\n",
        "        raise ValueError(\"docs must contain at least one candidate document\")\n",
        "\n",
        "    prompt_parts = [\n",
        "        \"당신은 대화 응답 후보를 평가하는 전문가입니다.\\n\",\n",
        "        f\"사용자 질문: \\\"{query}\\\"\\n\",\n",
        "        \"다음은 검색된 응답 후보들입니다.\\n\"\n",
        "    ]\n",
        "\n",
        "    for idx, doc in enumerate(docs, start=1):\n",
        "        # Flatten newlines and cap each snippet at 300 characters to keep the prompt short.\n",
        "        snippet = doc.page_content.strip().replace(\"\\n\", \" \")\n",
        "        if len(snippet) > 300:\n",
        "            snippet = snippet[:300] + \"...\"\n",
        "        prompt_parts.append(\n",
        "            f\"[{idx}]\\n\"\n",
        "            f\"Filename: {doc.metadata.get('filename')}\\n\"\n",
        "            f\"Emotion: {doc.metadata.get('emotion')}, Relation: {doc.metadata.get('relation')}\\n\"\n",
        "            f\"Content: \\\"{snippet}\\\"\\n\"\n",
        "        )\n",
        "\n",
        "    prompt_parts.append(\n",
        "        \"\\n위 후보들 중에서, 사용자 질문에 가장 적절한 응답을 하나 선택하고, 그 이유를 간단히 설명해주세요.\\n\"\n",
        "        \"반드시 다음 형식으로 응답해 주세요:\\n\"\n",
        "        \"선택: [번호]\\n\"\n",
        "        \"이유: [간단한 설명]\\n\"\n",
        "    )\n",
        "\n",
        "    full_prompt = \"\\n\".join(prompt_parts)\n",
        "\n",
        "    response = openai.chat.completions.create(\n",
        "        model=model,\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": \"당신은 뛰어난 대화 평가자입니다.\"},\n",
        "            {\"role\": \"user\", \"content\": full_prompt}\n",
        "        ],\n",
        "        max_tokens=300,\n",
        "        temperature=0.0\n",
        "    )\n",
        "\n",
        "    # The message content can be None; treat that as an empty reply.\n",
        "    gpt_reply = (response.choices[0].message.content or \"\").strip()\n",
        "    selected_idx = _parse_selected_index(gpt_reply, len(docs))\n",
        "\n",
        "    best_doc = docs[selected_idx - 1]\n",
        "    return best_doc, gpt_reply\n",
        "\n",
        "# ─── 6) 최종 답변 간결하게 생성 함수 ─────────────────────────────────────────────\n",
        "def generate_final_answer(query, best_doc, model=\"gpt-4o-mini\"):\n",
        "    \"\"\"Rewrite the selected candidate into a short, empathetic reply for the user.\n",
        "\n",
        "    Args:\n",
        "        query: The user's question, quoted in the prompt.\n",
        "        best_doc: Selected document; its full ``page_content`` is placed in the prompt.\n",
        "        model: Chat model name passed to ``openai.chat.completions.create``.\n",
        "\n",
        "    Returns:\n",
        "        str: The stripped text of the first completion choice.\n",
        "    \"\"\"\n",
        "    source_text = best_doc.page_content\n",
        "    prompt_pieces = [\n",
        "        \"다음은 사용자의 질문과, 선택된 최적 응답 후보입니다.\\n\\n\",\n",
        "        f\"사용자 질문: \\\"{query}\\\"\\n\",\n",
        "        \"선택된 후보 응답 내용(원문):\\n\",\n",
        "        f\"\\\"\\\"\\\"\\n{source_text}\\n\\\"\\\"\\\"\\n\\n\",\n",
        "        \"위 원문에서, 불필요한 반복/인사말/개인정보 등은 모두 제거하고, \",\n",
        "        \"사용자가 이해하기 쉽도록 핵심만 남겨 간결하게 재작성해주세요.\\n\",\n",
        "        \"문체는 친절하고 공감 가득한 톤을 유지해 주시고, \",\n",
        "        \"최종 답변만 출력해 주세요.\",\n",
        "    ]\n",
        "\n",
        "    # System message sets the counsellor persona; the user message carries the rewrite request.\n",
        "    chat_messages = [\n",
        "        {\"role\": \"system\", \"content\": \"당신은 친절하고 공감능력이 뛰어난 상담사입니다.\"},\n",
        "        {\"role\": \"user\", \"content\": \"\".join(prompt_pieces)},\n",
        "    ]\n",
        "\n",
        "    completion = openai.chat.completions.create(\n",
        "        model=model,\n",
        "        messages=chat_messages,\n",
        "        max_tokens=300,\n",
        "        temperature=0.7,\n",
        "    )\n",
        "\n",
        "    return completion.choices[0].message.content.strip()\n",
        "\n",
        "# ─── 7) Gradio 응용: 채팅 인터페이스 구축 ───────────────────────────────────────\n",
        "# Hard-coded locations of the saved FAISS index and of the labelled JSON data set.\n",
        "# NOTE(review): these look like Google Drive mount paths in Colab — confirm Drive is mounted first.\n",
        "index_dir = \"/content/drive/MyDrive/2025_Bigdata_nlp_class/faiss_index\"\n",
        "folder_path = \"/content/drive/MyDrive/2025_Bigdata_nlp_class/aihub_dataset/Training/02_label_data\"\n",
        "\n",
        "# Load the documents and initialise FAISS.\n",
        "# create_or_load_faiss only embeds split_docs when no saved index is found in index_dir;\n",
        "# otherwise the saved index is loaded and split_docs is not used.\n",
        "documents = load_documents_with_metadata(folder_path)\n",
        "split_docs = split_documents(documents)\n",
        "faiss_db = create_or_load_faiss(index_dir, split_docs)\n",
        "\n",
        "def chat_response(query, emotion, relation):\n",
        "    \"\"\"Gradio callback: retrieve candidates, let the model pick one, and return the final reply.\n",
        "\n",
        "    Args:\n",
        "        query: Question text from the UI.\n",
        "        emotion: Emotion filter from the UI; None or blank means no emotion filter.\n",
        "        relation: Relation filter from the UI; None or blank means no relation filter.\n",
        "\n",
        "    Returns:\n",
        "        str: The generated answer, or a fixed notice when no document matches the filters.\n",
        "    \"\"\"\n",
        "    # A textbox left empty may arrive as \"\" rather than None; passing \"\" on as an emotion\n",
        "    # would be compared against real labels and match nothing, so map blanks to None.\n",
        "    emotion = (emotion or \"\").strip() or None\n",
        "    relation = (relation or \"\").strip() or None\n",
        "\n",
        "    candidates = filtered_similarity_search(faiss_db, query, emotion, relation)\n",
        "    if not candidates:\n",
        "        return \"조건에 맞는 문서가 없습니다.\"\n",
        "\n",
        "    # The explanation returned alongside the chosen document is not shown in the UI.\n",
        "    best_doc, _ = choose_best_doc_with_gpt(query, candidates, model=\"gpt-4o-mini\")\n",
        "    final_answer = generate_final_answer(query, best_doc, model=\"gpt-4o-mini\")\n",
        "    return final_answer\n",
        "\n",
        "# Build the UI: a question box, two filter boxes side by side, a submit button and an answer box.\n",
        "with gr.Blocks() as demo:\n",
        "    gr.Markdown(\"## 감정/관계 기반 Empathy QA 시스템\")\n",
        "    with gr.Row():\n",
        "        txt_query = gr.Textbox(label=\"질문\", placeholder=\"질문을 입력하세요...\", lines=2)\n",
        "    with gr.Row():\n",
        "        txt_emotion = gr.Textbox(label=\"Emotion (예: 기쁨, 당황, 분노)\", placeholder=\"ex) 기쁨\")\n",
        "        txt_relation = gr.Textbox(label=\"Relation (예: 부모자녀, 부부, 연인)\", placeholder=\"ex) 부모자녀\")\n",
        "    btn_submit = gr.Button(\"전송\")\n",
        "    output = gr.Textbox(label=\"답변\", lines=5)\n",
        "\n",
        "    # Clicking the button calls chat_response(query, emotion, relation) and shows its return value.\n",
        "    btn_submit.click(chat_response, inputs=[txt_query, txt_emotion, txt_relation], outputs=output)\n",
        "\n",
        "# Start the Gradio app.\n",
        "demo.launch()\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}