Rashidbm committed on
Commit
86e9b95
·
unverified ·
1 Parent(s): 05a34c2

Update train_hybrid_model.py

Browse files
Hybrid_model_code/train_hybrid_model.py CHANGED
"""Train a hybrid Human-vs-AI Arabic text classifier.

Combines Farasa-derived linguistic features with TF-IDF character N-gram
features, concatenates them into one sparse matrix, and trains an XGBoost
classifier.

NOTE(review): this file was reconstructed from a rendered commit diff.
Lines collapsed inside the diff hunks (the TF-IDF ``ngram_range`` and most
XGBClassifier hyperparameters) are marked with TODO comments below and must
be confirmed against the repository before this script is trusted.
"""
import os

import pandas as pd
from scipy.sparse import hstack, csr_matrix  # horizontal join of sparse feature matrices
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# --- Configuration ---
LINGUISTIC_FILE = "features_gemini_vs_human_augmented.csv"  # Farasa pipeline output
RAW_DATA_FILE = "merged_dataset_clean2.csv"                 # raw texts for N-grams
COL_HUMAN = "human_collected_dataset"
COL_GEMINI = "gemini_rephrased_v2_5"


def main():
    """Run the full load → featurize → train → evaluate pipeline."""
    print("Starting hybrid model training (Farasa + N-grams)...")

    # --- 1. Load the Farasa linguistic features ---
    if not os.path.exists(LINGUISTIC_FILE):
        print(f"Error: linguistic features file '{LINGUISTIC_FILE}' not found. Run the previous pipeline first.")
        raise SystemExit(1)  # non-zero status so callers/CI notice the failure

    df_features = pd.read_csv(LINGUISTIC_FILE)
    df_features.dropna(inplace=True)
    X_linguistic = df_features.drop(columns=['label'])
    y = df_features['label']
    print(f"Loaded linguistic features: {len(X_linguistic)} samples.")

    # --- 2. Load the raw texts (source of the character N-grams) ---
    try:
        df_raw = pd.read_csv(RAW_DATA_FILE)
        df_raw.columns = df_raw.columns.str.strip()  # tolerate padded headers

        # Stack human texts first, then AI texts, mirroring the order the
        # feature file was built in.
        df_human = pd.DataFrame({'text': df_raw[COL_HUMAN]})
        df_ai = pd.DataFrame({'text': df_raw[COL_GEMINI]})
        df_text = pd.concat([df_human, df_ai], ignore_index=True)

        # Truncate both sides to a common length so the matrices can be joined.
        # NOTE(review): dropna() above may have removed *interior* rows from the
        # feature file, in which case positional truncation misaligns rows with
        # their texts — verify row correspondence upstream.
        min_len = min(len(df_features), len(df_text))
        df_text = df_text.iloc[:min_len]
        X_linguistic = X_linguistic.iloc[:min_len]
        y = y.iloc[:min_len]

        X_text = df_text['text'].astype(str)
        print(f"Loaded raw text: {len(X_text)} samples (synced).")
    except (OSError, KeyError) as e:  # missing file / missing column
        print(f"Error loading raw text data: {e}")
        raise SystemExit(1)

    # --- 3. Character N-gram features ---
    print("Generating N-gram features (TF-IDF character n-grams)...")
    tfidf = TfidfVectorizer(
        analyzer='char',
        ngram_range=(2, 5),  # TODO(review): hidden in the collapsed diff hunk — confirm value
        min_df=5,
    )
    X_ngrams = tfidf.fit_transform(X_text)
    print(f"N-gram features shape: {X_ngrams.shape}")

    # --- 4. Hybrid concatenation: [N-grams | linguistic features] ---
    # Farasa features become a sparse matrix so they can be hstacked with TF-IDF.
    X_linguistic_sparse = csr_matrix(X_linguistic.values)
    X_hybrid = hstack([X_ngrams, X_linguistic_sparse])
    print(f"Hybrid dataset shape: {X_hybrid.shape}")

    # --- 5. Train/test split (stratified to preserve class balance) ---
    X_train, X_test, y_train, y_test = train_test_split(
        X_hybrid, y, test_size=0.2, random_state=42, stratify=y
    )

    # --- 6. Train XGBoost ---
    print("Training XGBoost hybrid model...")
    model = XGBClassifier(
        n_estimators=500,
        # TODO(review): the remaining hyperparameters were hidden in the
        # collapsed diff hunk (original file lines ~72-76) — restore the exact
        # tuned values from the repository; the two below are placeholders.
        random_state=42,
        eval_metric='logloss',
    )
    model.fit(X_train, y_train)

    # --- 7. Evaluation ---
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("\n" + "="*50)
    print(f"Final hybrid model accuracy: {accuracy * 100:.2f}%")
    print("="*50)

    print("\nClassification report:")
    # label 0 = Human, label 1 = AI (Gemini-rephrased)
    print(classification_report(y_test, y_pred, target_names=['Human', 'AI']))


if __name__ == "__main__":
    main()