| |
| import pandas as pd |
| import re |
| import joblib |
| import os |
| from datasets import load_dataset |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.svm import LinearSVC |
| from sklearn.pipeline import Pipeline |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import accuracy_score, precision_recall_fscore_support |
|
|
| |
| |
| |
# Destination path of the serialized pipeline written at the end of the script.
MODEL_OUTPUT = 'api/data/model_mbti.pkl'

# NOTE(review): the leading glyphs in this script's status messages look like
# mis-decoded emoji; they are kept as found -- restore them from the original
# file if it is available.
print("π Mengunduh dataset MBTI (7000 Data)...")

# Load the dataset and expose it as two Series: X (post text) and y (type label).
try:
    dataset = load_dataset("gmnsong/MBTI.csv", split="train")
    df = pd.DataFrame(dataset)

    # Fall back to the alternative column names when 'type' is absent.
    if 'type' not in df.columns:
        df.rename(columns={'label': 'type', 'text': 'posts'}, inplace=True)

    X = df['posts']
    y = df['type']
    print(f"β Data siap: {len(df)} baris.")
except Exception as e:
    # Top-level boundary: report any loading or schema failure, then stop with
    # a non-zero status so a shell or CI job can tell the run failed.
    print(f"β Error: {e}")
    raise SystemExit(1)
|
|
| |
def clean_text(text):
    """Return *text* lowercased, stripped of URLs and non-letters, single-spaced.

    The input is first converted with ``str()``, so non-string values are
    accepted (``None`` becomes ``"none"``). Removed characters are deleted,
    not replaced by a space, so ``"it's"`` becomes ``"its"``.
    """
    cleaned = str(text).lower()
    # Order matters: URLs must go before punctuation is stripped, and
    # whitespace is collapsed last.
    substitutions = (
        (r'http\S+', ''),
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
|
# Normalize every post with clean_text before feature extraction.
print("π§Ή Membersihkan data...")
X = X.apply(clean_text)

print("π Melatih Model MBTI (SVM Optimized)...")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Unigram + bigram TF-IDF features, capped at 15000 terms.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
    stop_words='english',
    sublinear_tf=True,
)

# Linear SVM with class weights balanced across the type labels.
svm_step = LinearSVC(
    C=0.6,
    dual=False,
    class_weight='balanced',
)

pipeline = Pipeline([
    ('tfidf', tfidf_step),
    ('clf', svm_step),
])
pipeline.fit(X_train, y_train)
|
|
| |
# Score the fitted pipeline on the held-out split.
print("π Menghitung Metrik Evaluasi...")
predictions = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
# Weighted averages across classes; zero_division=0 scores a class with no
# predicted samples as 0 instead of emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test,
    predictions,
    average='weighted',
    zero_division=0,
)

# Print the results as a small fixed-width table.
banner = "=" * 40
print("\n" + banner)
print(" HASIL EVALUASI MODEL MBTI (FINAL)")
print(banner)
print(f"{'Metrik':<15} | {'Skor':<10}")
print("-" * 30)
print(f"{'Akurasi':<15} | {accuracy:.3f} ({accuracy*100:.1f}%)")
for label, score in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
):
    print(f"{label:<15} | {score:.3f}")
print(banner + "\n")
|
|
# Persist the fitted pipeline, making sure the destination folder exists first.
output_dir = os.path.dirname(MODEL_OUTPUT)
if output_dir:
    # os.makedirs('') raises FileNotFoundError, so skip directory creation
    # when MODEL_OUTPUT is a bare filename in the current directory.
    os.makedirs(output_dir, exist_ok=True)
joblib.dump(pipeline, MODEL_OUTPUT)
print(f"πΎ SUKSES! Model MBTI disimpan di: {MODEL_OUTPUT}")