Spaces:

Soundaryasos
/

Socialmediasentimentanalysis

Running

App Files Files Community

Soundaryasos committed 13 days ago

Commit

e2278a6

verified ·

1 Parent(s): 003cbcb

Update app.py

Browse files

Files changed (1) hide show

app.py +145 -9

app.py CHANGED Viewed

@@ -19,6 +19,8 @@ import numpy as np
 from sklearn.linear_model import Ridge
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
 # --------------------------
 # Initial Setup
@@ -371,6 +373,63 @@ def plot_sentiment(data, keyword):
         st.error(f"Plotting error: {str(e)}")
         return None
 # --------------------------
 # Main Application
 # --------------------------
@@ -399,12 +458,19 @@ def main():
                 "Search keyword",
                 placeholder="e.g., Apple, Tesla, etc."
             )
             analyze_btn = st.button("Fetch & Analyze")
         st.markdown("---")
         st.markdown("### Options")
         show_details = st.checkbox("Show detailed results", value=False)
         enable_prediction = st.checkbox("Enable sentiment prediction", value=True)
         st.markdown("---")
     # Main content
@@ -448,8 +514,8 @@ def main():
             with st.spinner(f"Gathering data for '{keyword}'..."):
                 start_time = time.time()
-                reddit_data = fetch_reddit_data(keyword)
-                youtube_data = fetch_youtube_data(keyword)
                 if reddit_data.empty and youtube_data.empty:
                     st.error("No data found. Try a different keyword.")
@@ -457,9 +523,17 @@ def main():
                 combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
                 # Filter out empty or invalid texts
                 combined_data = combined_data[combined_data['text'].str.strip() != '']
                 # Analyze in batches
                 analysis_results = []
                 for _, row in combined_data.iterrows():
@@ -469,6 +543,8 @@ def main():
                 combined_data['vader'] = [r['vader'] for r in analysis_results]
                 combined_data['bert'] = [r['bert'] for r in analysis_results]
                 combined_data['textblob'] = [r['textblob'] for r in analysis_results]
                 # Ensure no NaN values in sentiment scores
                 combined_data = combined_data.dropna(subset=['vader', 'bert', 'textblob'])
@@ -477,12 +553,15 @@ def main():
                 processing_time = time.time() - start_time
                 st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
                 st.subheader(f"📈 Overall Sentiment for '{keyword}'")
                 cols = st.columns(3)
                 avg_sentiment = combined_data['average'].mean()
-                pos_pct = (combined_data['average'] > 0.1).mean() * 100
-                neg_pct = (combined_data['average'] < -0.1).mean() * 100
                 cols[0].metric("Avg Sentiment", f"{avg_sentiment:.2f}",
                              "Positive" if avg_sentiment > 0 else "Negative" if avg_sentiment < 0 else "Neutral")
@@ -490,13 +569,32 @@ def main():
                 cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
                 st.subheader("📊 Content Visualization")
-                all_text = " ".join(combined_data['text'])
                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
                 if wordcloud_img:
                     st.image(wordcloud_img, use_column_width=True)
                 else:
                     st.info("No word cloud generated due to insufficient text")
                 # Filter recent data
                 combined_data['date'] = pd.to_datetime(combined_data['date'])
                 recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]
@@ -514,7 +612,7 @@ def main():
                                 fig = plot_sentiment(full_data, keyword)
                             else:
                                 daily_data = daily_data if daily_data is not None else recent_data[['date', 'average']].assign(type='actual')
-                                fig = plot_sentiment(daily_data, keyword)
                     else:
                         daily_data = prepare_data_for_prediction(recent_data)
                         fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
@@ -533,12 +631,50 @@ def main():
                         else:
                             st.info("📊 Prediction: Sentiment is expected to remain stable in the next 15 days")
                     if show_details:
-                        st.subheader("🔍 Detailed Results")
-                        st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
                 else:
                     st.info("No recent data found (within last 60 days).")
 if __name__ == "__main__":
     try:
         nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))

 from sklearn.linear_model import Ridge
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import make_pipeline
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
+from sklearn.model_selection import train_test_split
 # --------------------------
 # Initial Setup
         st.error(f"Plotting error: {str(e)}")
         return None
+# --------------------------
+# Evaluation & Dataset Stats
+# --------------------------
+def compute_dataset_stats(df):
+    """Summarise a collected-posts DataFrame for display.
+
+    Args:
+        df: DataFrame of collected items. A 'date' column and a 'source'
+            column are used when present; neither is required.
+
+    Returns:
+        dict with keys:
+            'total'         - number of rows in df
+            'start_date'    - minimum of df['date'], or None if unavailable
+            'end_date'      - maximum of df['date'], or None if unavailable
+            'source_counts' - {source: row count}, or {} when df has no
+                              'source' column
+    """
+    stats = {'total': len(df)}
+    try:
+        start_date = df['date'].min()
+        end_date = df['date'].max()
+    except (KeyError, TypeError, ValueError):
+        # KeyError: no 'date' column. TypeError/ValueError: values that cannot
+        # be ordered against each other (e.g. strings mixed with timestamps).
+        # Both bounds are reported as None together, never just one of them.
+        start_date = end_date = None
+    stats['start_date'] = start_date
+    stats['end_date'] = end_date
+    stats['source_counts'] = df['source'].value_counts().to_dict() if 'source' in df.columns else {}
+    return stats
+def prepare_eval_labels(df, pos_thresh=0.1, neg_thresh=-0.1):
+    """Return a copy of df with categorical sentiment labels attached.
+
+    Two columns are added, both derived from the 'average' column:
+        'label'     - 'Positive' when average > pos_thresh, 'Negative' when
+                      average < neg_thresh, otherwise 'Neutral'
+        'label_num' - 0 for Negative, 1 for Neutral, 2 for Positive
+
+    The input DataFrame is not modified.
+    """
+    class_ids = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
+
+    def _to_label(score):
+        # The positive test runs first, matching the threshold precedence.
+        if score > pos_thresh:
+            return 'Positive'
+        if score < neg_thresh:
+            return 'Negative'
+        return 'Neutral'
+
+    labelled = df.copy()
+    labelled['label'] = labelled['average'].apply(_to_label)
+    labelled['label_num'] = labelled['label'].map(class_ids)
+    return labelled
+def evaluate_sentiment_model(df, pos_thresh=0.1, neg_thresh=-0.1):
+    """Evaluate a Ridge-regression-then-round baseline on the sentiment scores.
+
+    Rows missing any of 'vader', 'bert', 'textblob' or 'average' are dropped.
+    Class labels are derived from 'average' via prepare_eval_labels, a Ridge
+    regressor is fitted on the three score columns with an 80/20 shuffled
+    split (random_state=42), and its predictions are rounded and clipped to
+    the class ids 0..2.
+
+    Args:
+        df: DataFrame with 'vader', 'bert', 'textblob' and 'average' columns.
+        pos_thresh: 'average' above this value is labelled Positive.
+        neg_thresh: 'average' below this value is labelled Negative.
+
+    Returns:
+        None when fewer than 5 usable rows remain; otherwise a dict with
+        'accuracy', weighted 'precision' / 'recall' / 'f1', a 3x3
+        'confusion_matrix' ordered Negative, Neutral, Positive, and
+        'class_map' (string class id -> class name).
+    """
+    class_ids = [0, 1, 2]
+    df = df.dropna(subset=['vader', 'bert', 'textblob', 'average'])
+    if len(df) < 5:
+        return None  # insufficient data
+    # Forward the thresholds so evaluation labels can use the same cut-offs
+    # as the rest of the app instead of always falling back to the defaults.
+    df_eval = prepare_eval_labels(df, pos_thresh=pos_thresh, neg_thresh=neg_thresh)
+    X = df_eval[['vader', 'bert', 'textblob']].values
+    y = df_eval['label_num'].values
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
+    # Using Ridge as simple baseline regressor, then rounding predictions to nearest class
+    clf = Ridge(alpha=1.0)
+    clf.fit(X_train, y_train)
+    raw_preds = clf.predict(X_test)
+    preds = np.clip(np.round(raw_preds).astype(int), 0, 2)
+    acc = accuracy_score(y_test, preds)
+    precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average='weighted', zero_division=0)
+    # Pin the label order: without it the matrix shrinks whenever a class is
+    # missing from the test split and no longer lines up with class_map.
+    cm = confusion_matrix(y_test, preds, labels=class_ids)
+    return {
+        'accuracy': acc,
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+        'confusion_matrix': cm,
+        'class_map': {'0': 'Negative', '1': 'Neutral', '2': 'Positive'}
+    }
 # --------------------------
 # Main Application
 # --------------------------
                 "Search keyword",
                 placeholder="e.g., Apple, Tesla, etc."
             )
+            reddit_limit = st.number_input("Reddit results", min_value=5, max_value=200, value=30, step=5)
+            youtube_limit = st.number_input("YouTube results", min_value=5, max_value=50, value=30, step=5)
             analyze_btn = st.button("Fetch & Analyze")
         st.markdown("---")
         st.markdown("### Options")
         show_details = st.checkbox("Show detailed results", value=False)
         enable_prediction = st.checkbox("Enable sentiment prediction", value=True)
+        show_evaluation = st.checkbox("Run model evaluation (adds compute)", value=True)
+        st.markdown("---")
+        st.markdown("### Experimental Settings")
+        pos_thresh = st.number_input("Positive threshold", value=0.1, step=0.05, format="%.2f")
+        neg_thresh = st.number_input("Negative threshold", value=-0.1, step=0.05, format="%.2f")
         st.markdown("---")
     # Main content
             with st.spinner(f"Gathering data for '{keyword}'..."):
                 start_time = time.time()
+                reddit_data = fetch_reddit_data(keyword, limit=reddit_limit)
+                youtube_data = fetch_youtube_data(keyword, limit=youtube_limit)
                 if reddit_data.empty and youtube_data.empty:
                     st.error("No data found. Try a different keyword.")
                 combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
+                # Basic dataset stats BEFORE cleaning
+                raw_stats = compute_dataset_stats(combined_data)
                 # Filter out empty or invalid texts
+                combined_data['text'] = combined_data['text'].fillna('').astype(str)
                 combined_data = combined_data[combined_data['text'].str.strip() != '']
+                # Ensure date column exists and is datetime
+                combined_data['date'] = pd.to_datetime(combined_data['date'], errors='coerce')
+                combined_data = combined_data.dropna(subset=['date'])
                 # Analyze in batches
                 analysis_results = []
                 for _, row in combined_data.iterrows():
                 combined_data['vader'] = [r['vader'] for r in analysis_results]
                 combined_data['bert'] = [r['bert'] for r in analysis_results]
                 combined_data['textblob'] = [r['textblob'] for r in analysis_results]
+                combined_data['bert_label'] = [r['bert_label'] for r in analysis_results]
+                combined_data['bert_confidence'] = [r['bert_confidence'] for r in analysis_results]
                 # Ensure no NaN values in sentiment scores
                 combined_data = combined_data.dropna(subset=['vader', 'bert', 'textblob'])
                 processing_time = time.time() - start_time
                 st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
+                # Dataset statistics AFTER cleaning
+                cleaned_stats = compute_dataset_stats(combined_data)
                 st.subheader(f"📈 Overall Sentiment for '{keyword}'")
                 cols = st.columns(3)
                 avg_sentiment = combined_data['average'].mean()
+                pos_pct = (combined_data['average'] > pos_thresh).mean() * 100
+                neg_pct = (combined_data['average'] < neg_thresh).mean() * 100
                 cols[0].metric("Avg Sentiment", f"{avg_sentiment:.2f}",
                              "Positive" if avg_sentiment > 0 else "Negative" if avg_sentiment < 0 else "Neutral")
                 cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
                 st.subheader("📊 Content Visualization")
+                all_text = " ".join(combined_data['text'].tolist())
                 wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
                 if wordcloud_img:
                     st.image(wordcloud_img, use_column_width=True)
                 else:
                     st.info("No word cloud generated due to insufficient text")
+                # Show dataset stats panel
+                with st.expander("📚 Dataset Statistics (raw vs cleaned)"):
+                    st.write("**Before cleaning**")
+                    st.write(f"- Total collected: {raw_stats['total']}")
+                    st.write(f"- Date range: {raw_stats['start_date']} -> {raw_stats['end_date']}")
+                    st.write(f"- Source counts: {raw_stats['source_counts']}")
+                    st.write("**After cleaning**")
+                    st.write(f"- Total after cleaning: {cleaned_stats['total']}")
+                    st.write(f"- Date range: {cleaned_stats['start_date']} -> {cleaned_stats['end_date']}")
+                    st.write(f"- Source counts: {cleaned_stats['source_counts']}")
+                    st.write("**Sentiment score summary (average)**")
+                    st.write(combined_data['average'].describe().to_frame().T)
+                    st.write("**Sentiment distribution histogram**")
+                    fig_hist = px.histogram(combined_data, x='average', nbins=30, title="Average Sentiment Distribution")
+                    st.plotly_chart(fig_hist, use_container_width=True)
                 # Filter recent data
                 combined_data['date'] = pd.to_datetime(combined_data['date'])
                 recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]
                                 fig = plot_sentiment(full_data, keyword)
                             else:
                                 daily_data = daily_data if daily_data is not None else recent_data[['date', 'average']].assign(type='actual')
+                                fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
                     else:
                         daily_data = prepare_data_for_prediction(recent_data)
                         fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
                         else:
                             st.info("📊 Prediction: Sentiment is expected to remain stable in the next 15 days")
+                    if show_evaluation:
+                        with st.spinner("Running evaluation..."):
+                            eval_results = evaluate_sentiment_model(recent_data)
+                            if eval_results is None:
+                                st.info("Not enough data points for evaluation.")
+                            else:
+                                st.subheader("📏 Model Evaluation Results (Baseline Ridge)")
+                                st.write(f"**Accuracy:** {eval_results['accuracy']:.3f}")
+                                st.write(f"**Precision:** {eval_results['precision']:.3f}")
+                                st.write(f"**Recall:** {eval_results['recall']:.3f}")
+                                st.write(f"**F1-score:** {eval_results['f1']:.3f}")
+                                st.write("**Confusion Matrix:**")
+                                st.write(eval_results['confusion_matrix'])
                     if show_details:
+                        st.subheader("🔍 Detailed Results (Recent Data)")
+                        st.dataframe(recent_data[['date', 'source', 'text', 'vader', 'bert', 'textblob', 'average']], use_container_width=True)
                 else:
                     st.info("No recent data found (within last 60 days).")
+    # Experimental details panel outside main flow so reviewers see it even without running analyses
+    with st.expander("🧪 Experimental Details & Settings (Methodology)"):
+        st.markdown("""
+        **Preprocessing Steps**
+        - Remove empty posts and NAs.
+        - Truncate text to 2000 characters to keep BERT inference performant (BERT uses <=512 tokens).
+        - Convert model outputs to numeric ranges: BERT labels mapped to [-1, -0.5, 0, 0.5, 1].
+        - Aggregate VADER, BERT, and TextBlob by mean to create a fused 'average' sentiment score.
+        **Model Choices & Hyperparameters**
+        - VADER: default lexicon (lexicon-based sentiment for short social text).
+        - BERT: `nlptown/bert-base-multilingual-uncased-sentiment` used for multilingual rating-style classification.
+        - Regression baseline for temporal prediction: PolynomialFeatures(degree=2) + Ridge(alpha=1.0).
+        - Evaluation baseline: Ridge(alpha=1.0) regressor trained on [vader, bert, textblob] then rounded to class labels.
+        **Train/Test Split**
+        - Standard 80% train / 20% test split (random_state=42) used for evaluation experiments.
+        **Labeling thresholds**
+        - Positive threshold: configurable (default 0.1)
+        - Negative threshold: configurable (default -0.1)
+        """)
+        st.markdown("**Notes for reviewers:** Add more advanced time-series models (Prophet, ARIMA, LSTM) if temporal forecasting accuracy is critical. Current Ridge polynomial baseline is intentionally simple and explained in methodology.")
 if __name__ == "__main__":
     try:
         nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))