"""Semantic movie search demo.

Embeds preprocessed plot sentences with an Instructor model (corpus embeddings
are read from a local pickle cache) and serves a Gradio UI that returns the
top-5 movies whose plots best match a free-text query.
"""

import heapq
import os
import pickle

import gradio as gr
import nltk
import pandas as pd
from datasets import load_dataset
from gradio.components import Label
from InstructorEmbedding import INSTRUCTOR
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# Heavy one-time setup: dataset download, embedding model, NLTK resources.
dataset = load_dataset("SandipPalit/Movie_Dataset")
model = INSTRUCTOR('hkunlp/instructor-xl')

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Movie id == row position in this frame; preprocess() tags sentences with it.
df = pd.DataFrame({"Title": dataset['train']['Title'],
                   "Plot": dataset['train']['Overview']})


def getSimilarity(sentences_a, sentences_b):
    """Return the cosine-similarity matrix between the cached corpus
    embeddings and the freshly encoded query sentences.

    NOTE(review): ``sentences_a`` is never encoded here -- the corpus
    embeddings are assumed to be precomputed and cached in ./temp.pkl, with
    rows lining up one-to-one with ``sentences_a``. Confirm the cache is
    regenerated whenever the corpus (or ``size`` in getOutput) changes.
    """
    # SECURITY: pickle.load executes arbitrary code on load -- only ever read
    # a temp.pkl that this application itself produced.
    with open(os.path.join(os.getcwd(), "temp.pkl"), 'rb') as cache:
        embeddings_a = pickle.load(cache)
    embeddings_b = model.encode(sentences_b)
    return cosine_similarity(embeddings_a, embeddings_b)


def preprocess(idx, text, total_length):
    """Split ``text`` into sentences, lemmatize and drop stopwords, and tag
    each sentence with ``'@' + zero-padded idx`` so get_top_k_matches() can
    recover which movie row a sentence came from.

    ``total_length`` (the corpus size) fixes the zero-pad width so every tag
    has the same number of digits.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # str(idx).zfill(width) == '0' * (width - len(str(idx))) + str(idx)
    tag = '@' + str(idx).zfill(len(str(total_length)))
    return [
        ' '.join(lemmatizer.lemmatize(word)
                 for word in sentence.split()
                 if word not in stop_words) + tag
        for sentence in sent_tokenize(text)
    ]


def get_pre_processed_data(size):
    """Preprocess the plots of the first ``size`` movies into one flat list
    of tagged sentences."""
    sentences = []
    for idx, plot in enumerate(df['Plot'].head(size).tolist()):
        sentences.extend(preprocess(idx, plot, df.shape[0]))
    return sentences


def heapsort(np_array, k):
    """Build a max-heap of similarity scores (negated, since heapq is a
    min-heap) paired with their sentence index.

    ``k`` is unused; it is kept for backward compatibility with callers.
    """
    heap = []
    for idx, score in enumerate(np_array):
        heap.append((-score, idx))
    heapq.heapify(heap)
    return heap


def get_top_k_matches(np_array, k, sentences):
    """Return up to ``k`` distinct movie ids, best-matching sentence first.

    Each sentence carries its movie id as the zero-padded digit suffix after
    the final '@'; ids already seen are skipped so one movie cannot occupy
    several of the k result slots.
    """
    heap = heapsort(np_array, k)
    seen = set()        # ids already emitted (duplicate suppression)
    ordered_ids = []    # separate list preserves best-match-first order
    while heap and len(ordered_ids) < k:
        _neg_score, idx = heapq.heappop(heap)
        sentence = sentences[idx]
        # The id is everything after the last '@'; int() ignores the zero pad.
        movie_id = int(sentence[sentence.rfind('@') + 1:])
        if movie_id not in seen:
            seen.add(movie_id)
            ordered_ids.append(movie_id)
    return ordered_ids


def getOutput(text, size=1000):
    """Gradio handler: return up to five "title ... Plot ..." strings for the
    movies whose plot sentences best match ``text``.

    ``size`` caps how many movies from the corpus are searched.
    """
    corpus = get_pre_processed_data(int(size))
    scores = getSimilarity(corpus, [text])
    return [
        "title = " + df.iloc[movie_id]['Title'] + " " * 5
        + " Plot = " + df.iloc[movie_id]['Plot']
        for movie_id in get_top_k_matches(scores, 5, corpus)
    ]


# NOTE(review): gr.inputs.Textbox is the legacy (pre-3.x) Gradio API and was
# removed in Gradio 4; on newer versions use gr.Textbox(label="Text") instead.
iface = gr.Interface(
    fn=getOutput,
    inputs=[gr.inputs.Textbox(label="Text")],
    outputs=[Label() for _ in range(5)],
    examples=[
        ['After doing the list of experiments A mad scientist declares himself as the god'],
        ["Three men fight for the girl's love"],
    ],
)
iface.launch(debug=True)