| |
import os
import re
import sys

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
|
|
| |
| |
| |
# Destination path of the trained pipeline written with joblib at the end of the script.
MODEL_OUTPUT = 'api/data/model_emotion.pkl'
| |
|
|
# NOTE(review): the leading characters of several status messages in this
# script look like mis-encoded emoji; they are kept as found except where the
# damage split a string literal across two lines.
print("π Mengunduh dataset GoEmotions...")

try:
    # "simplified" configuration of GoEmotions, training split only.
    dataset = load_dataset("google-research-datasets/go_emotions", "simplified", split="train")
    df = pd.DataFrame(dataset)
    # Label names taken from the dataset schema; indexed by label id below.
    labels_list = dataset.features['labels'].feature.names

    def get_first_label(label_ids):
        """Return the name of the first label id, or "neutral" when there is none.

        Only the first id of a multi-label row is used, which turns the task
        into single-label classification.
        """
        if len(label_ids) > 0:
            return labels_list[label_ids[0]]
        return "neutral"

    df['emotion_label'] = df['labels'].apply(get_first_label)
    X = df['text']
    y = df['emotion_label']
    print(f"✅ Data siap: {len(df)} baris.")

except Exception as e:
    # Top-level boundary: report the failure and stop with a non-zero status
    # so callers (shell, CI) can tell that no data was loaded.
    print(f"β Error: {e}")
    sys.exit(1)
|
|
| |
def clean_text(text):
    """Normalize raw text before vectorization.

    The value is converted with ``str()`` and lower-cased, then:
    1. tokens starting with ``http`` (up to the next whitespace) are removed,
    2. every character that is not an ASCII letter or whitespace is removed,
    3. whitespace runs are collapsed to one space and the ends are trimmed.

    Args:
        text: Any value; non-strings are stringified first.

    Returns:
        The cleaned string (possibly empty).
    """
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
|
print("π§Ή Membersihkan data emosi...")
# Apply the normalization to every text before the train/test split.
X = X.apply(clean_text)
|
|
| |
print("π Melatih Model Emosi (Logistic Regression Fixed)...")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features (unigrams and bigrams, capped at 12000 terms, English
# stop words removed, sublinear term-frequency scaling) feeding a
# logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=12000,
        stop_words='english',
        ngram_range=(1, 2),
        sublinear_tf=True
    )),
    ('clf', LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        C=1.2
    ))
])

pipeline.fit(X_train, y_train)
|
|
| |
print("π Menghitung Metrik Evaluasi...")
predictions = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
# Weighted averages across classes; zero_division=0 scores an undefined
# metric as 0 instead of emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='weighted', zero_division=0)

# Plain-text report of the held-out metrics.
print("\n" + "="*40)
print(" HASIL EVALUASI MODEL EMOSI (FINAL)")
print("="*40)
print(f"{'Metrik':<15} | {'Skor':<10}")
print("-" * 30)
print(f"{'Akurasi':<15} | {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"{'Precision':<15} | {precision:.3f}")
print(f"{'Recall':<15} | {recall:.3f}")
print(f"{'F1-Score':<15} | {f1:.3f}")
print("="*40 + "\n")
|
|
# Make sure the destination directory exists before writing the model.
output_dir = os.path.dirname(MODEL_OUTPUT)
if output_dir:  # a bare file name has no directory part; os.makedirs('') would raise
    os.makedirs(output_dir, exist_ok=True)
joblib.dump(pipeline, MODEL_OUTPUT)
print(f"πΎ SUKSES! Model Emosi disimpan di: {MODEL_OUTPUT}")