import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import time
from multiprocessing import Pool
embed_type = 'SBERT'  # Change this to 'MLFPA' or 'BERT' as needed
# Load the cached parquet embeddings; if the cache is missing, build it from the raw .npz.
# Note: the read and write paths must match, otherwise the cache is never hit on reruns.
parquet_path = f'BERT embeddings/{embed_type.lower().replace("-", "")}_embeddings.parquet'
try:
    embeddings_df = pd.read_parquet(parquet_path)
except (FileNotFoundError, OSError):
    # Fall back to the raw export: BERT embeddings/sbert_embedding.npz
    embeddings = np.load('BERT embeddings/sbert_embedding.npz')['sbert_embedding']
    print(embeddings.shape)  # sanity-check the array shape
    print(type(embeddings))  # confirm it loaded as a NumPy array
    # Convert to a DataFrame and cache it as parquet so future runs hit the try branch
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df.to_parquet(parquet_path)
    embeddings_df = pd.read_parquet(parquet_path)
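# Optional memory saving (a sketch, not part of the original pipeline): .npz
# exports often load as float64; casting to float32 roughly halves memory use
# with negligible effect on downstream PCA/KMeans.
# embeddings_df = embeddings_df.astype(np.float32)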
# Clustering pipeline: standardize, reduce with PCA, then KMeans.
def scale_and_pca(embeddings_df):
    # Standardize the features to zero mean and unit variance
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings_df)
    # Reduce to 3 dimensions with PCA
    pca = PCA(n_components=3)
    embeddings_pca = pca.fit_transform(embeddings_scaled)
    return embeddings_pca
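# Optional diagnostic (a sketch, using only the imports above): check how much
# variance 3 principal components actually retain before committing to such an
# aggressive reduction.
# def pca_variance_check(df, n_components=3):
#     scaled = StandardScaler().fit_transform(df)
#     pca = PCA(n_components=n_components).fit(scaled)
#     print(f"Explained variance ratios: {pca.explained_variance_ratio_}")
#     print(f"Total retained: {pca.explained_variance_ratio_.sum():.3f}")
# pca_variance_check(embeddings_df)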
embeddings_pca = scale_and_pca(embeddings_df)
# Free the full-dimensional embeddings from memory
del embeddings_df
# 3D scatter plot of the first three principal components
def plot_3d_scatter(embeddings_pca):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2], s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(f'3D PCA of {embed_type} Embeddings')
    plt.show()
# plot_3d_scatter(embeddings_pca)
# def compute_silhouette(n_clusters, data):
#     kmeans = KMeans(n_clusters=n_clusters, random_state=420)
#     labels = kmeans.fit_predict(data)
#     silhouette_avg = silhouette_score(data, labels)
#     print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
#     return silhouette_avg
# silhouette_scores = []
# for i in range(2, 10):
#     start_time = time.time()
#     silhouette_scores.append(compute_silhouette(i, embeddings_pca))
#     end_time = time.time()
#     print(f"Time taken for n_clusters = {i}: {end_time - start_time} seconds")
# # Plot silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 10), silhouette_scores, marker='o')
# plt.title('Silhouette Scores for Different Cluster Sizes')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.xticks(range(2, 10))
# plt.grid()
# plt.show()
# # Save silhouette scores to CSV
# silhouette_df = pd.DataFrame({'n_clusters': range(2, 10), 'silhouette_score': silhouette_scores})
# silhouette_df.to_csv('MLFPA_project-main/Raf_scores/silhouette_scores.csv', index=False)
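# Sketch: the Pool import above hints at parallelizing the silhouette sweep; a
# minimal version, assuming compute_silhouette above is re-enabled. KMeans also
# parallelizes internally, so measure before committing to this.
# if __name__ == '__main__':
#     with Pool() as pool:
#         silhouette_scores = pool.starmap(
#             compute_silhouette, [(i, embeddings_pca) for i in range(2, 10)])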
# Save the cluster labels for n_clusters = 5
def save_cluster_labels(n_clusters, data):
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    labels_df = pd.DataFrame(labels, columns=['cluster_label'])
    labels_df.to_csv(f'raf_clusters/cluster_labels_{embed_type.lower()}.csv', index=False)
    return labels_df
save_cluster_labels(5, embeddings_pca)
# plot_3d_scatter(embeddings_pca)
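# Sketch (an assumption, not in the original pipeline): color the 3D PCA plot
# by the k=5 KMeans labels just saved, to eyeball cluster separation.
# labels = pd.read_csv(f'raf_clusters/cluster_labels_{embed_type.lower()}.csv')['cluster_label']
# fig = plt.figure(figsize=(10, 7))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2],
#            c=labels, s=1, cmap='tab10')
# ax.set_xlabel('PC 1')
# ax.set_ylabel('PC 2')
# ax.set_zlabel('PC 3')
# plt.title('KMeans (k=5) clusters in PCA space')
# plt.show()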