Rakesh30 committed
Commit 8d3df09 · 1 Parent(s): 2e5eee1

Upload 2 files

Files changed (2)
  1. movie.py +82 -0
  2. requirements.txt +4 -0
movie.py ADDED
@@ -0,0 +1,82 @@
+ import gradio as gr
+ from datasets import load_dataset
+ dataset = load_dataset("SandipPalit/Movie_Dataset")
+
+ from InstructorEmbedding import INSTRUCTOR
+ model = INSTRUCTOR('hkunlp/instructor-xl')
+
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ def getSimilarity(sentences_a, sentences_b):
+     embeddings_a = model.encode(sentences_a)
+     embeddings_b = model.encode(sentences_b)
+     similarities = cosine_similarity(embeddings_a, embeddings_b)
+     return similarities
+
+ # get the indices of np_array with the highest scores
+ import heapq
+ def get_top_k(h, k):
+     output = []
+     for i in range(k):
+         output.append(heapq.heappop(h)[1])
+     return output
+
+ def heapsort(np_array, k):
+     h = []
+     for idx, score in enumerate(np_array):
+         heapq.heappush(h, (-float(score), idx))  # negate so the min-heap acts as a max-heap
+     return get_top_k(h, k)
+
+
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize, sent_tokenize
+ from nltk.stem import WordNetLemmatizer
+
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+
+ def preprocess(idx, text, total_length):
+     sentences = sent_tokenize(text)
+     stop_words = set(stopwords.words('english'))
+     lemmatizer = WordNetLemmatizer()
+
+     padding = '0' * (len(str(total_length)) - len(str(idx)))  # zero-pad idx so every '@<idx>' suffix has the same width
+     output = []
+     for sentence in sentences:
+         output.append(' '.join([lemmatizer.lemmatize(word) for word in sentence.split() if word not in stop_words]) + '@' + padding + str(idx))
+     return output
+
+ def get_pre_processed_data(size):
+     sentences = []
+     for idx, x in enumerate(df['Plot'].head(size).tolist()):
+         sentences.extend(preprocess(idx, x, df.shape[0]))
+     return sentences
+
+ def get_top_k_matches(np_array, k, sentences):
+     indices = []
+     for idx in heapsort(np_array, k):
+         i = len(sentences[idx]) - 1  # recover the source row stored after '@' (the reason for keeping idx, not the sentence)
+         count = 1
+         number = 0
+         while sentences[idx][i] != '@':  # O(number of digits), effectively O(1)
+             number = number + count * int(sentences[idx][i])
+             count *= 10
+             i -= 1
+         indices.append(number)
+     # print(indices)
+     return indices
+
+
+ import pandas as pd
+ df = pd.DataFrame({"Title": dataset['train']['Title'], "Plot": dataset['train']['Overview']})
+
+ def getOutput(text, size):
+     sentences = get_pre_processed_data(int(size))
+     np_array = getSimilarity(sentences, [text])[:, 0]  # flatten to 1D scores against the single query
+     s = df.iloc[get_top_k_matches(np_array, 2, sentences)]['Title']
+     return f"{s}."
+
+ iface = gr.Interface(fn=getOutput, inputs=[gr.Textbox(label="Text"), gr.Number(label="size")], outputs="text")
+ iface.launch(debug=True)
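
To see the selection logic of movie.py in isolation: a minimal, self-contained sketch of the heap-based top-k pass from heapsort()/get_top_k() and the '@'-suffix index recovery from get_top_k_matches(). The sentences and scores below are made up for illustration, not data from the Space.

import heapq

# Hypothetical tagged sentences and similarity scores (illustrative only).
sentences = ["thief plans casino robbery@00", "robot falls love@01", "heist crew last job@02"]
scores = [0.91, 0.12, 0.87]

# Push negated scores so heapq's min-heap pops the highest score first.
h = []
for idx, score in enumerate(scores):
    heapq.heappush(h, (-score, idx))
top2 = [heapq.heappop(h)[1] for _ in range(2)]
print(top2)  # [0, 2] -- positions of the two best scores

# Recover each source row index from the zero-padded '@<idx>' suffix;
# rsplit is an equivalent shortcut for the digit-walking loop in get_top_k_matches().
rows = [int(sentences[i].rsplit('@', 1)[1]) for i in top2]
print(rows)  # [0, 2]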
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ datasets
+ sentence_transformers
+ InstructorEmbedding
+
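
A note on the short requirements.txt: it leans on transitive installs (datasets pulls in pandas, and the sentence_transformers releases this code targets pull in scikit-learn and nltk), while Gradio comes preinstalled on Spaces. Separately, the INSTRUCTOR documentation encodes [instruction, text] pairs rather than the bare strings getSimilarity() passes; below is a hedged sketch of that calling convention, with instruction wording that is illustrative and not from this commit.

from InstructorEmbedding import INSTRUCTOR
from sklearn.metrics.pairwise import cosine_similarity

model = INSTRUCTOR('hkunlp/instructor-xl')  # same checkpoint as movie.py; a large download

# encode() also accepts [instruction, text] pairs; the instructions below are made up.
query_emb = model.encode([["Represent the movie query for retrieving relevant plots:",
                           "a heist that goes wrong"]])
plot_emb = model.encode([["Represent the movie plot for retrieval:",
                          "A crew of thieves plans one last casino robbery."]])
print(cosine_similarity(query_emb, plot_emb))  # (1, 1) similarity matrix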