Update app.py
Browse files
app.py
CHANGED
|
@@ -19,6 +19,8 @@ import numpy as np
|
|
| 19 |
from sklearn.linear_model import Ridge
|
| 20 |
from sklearn.preprocessing import PolynomialFeatures
|
| 21 |
from sklearn.pipeline import make_pipeline
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# --------------------------
|
| 24 |
# Initial Setup
|
|
@@ -371,6 +373,63 @@ def plot_sentiment(data, keyword):
|
|
| 371 |
st.error(f"Plotting error: {str(e)}")
|
| 372 |
return None
|
| 373 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
# --------------------------
|
| 375 |
# Main Application
|
| 376 |
# --------------------------
|
|
@@ -399,12 +458,19 @@ def main():
|
|
| 399 |
"Search keyword",
|
| 400 |
placeholder="e.g., Apple, Tesla, etc."
|
| 401 |
)
|
|
|
|
|
|
|
| 402 |
analyze_btn = st.button("Fetch & Analyze")
|
| 403 |
|
| 404 |
st.markdown("---")
|
| 405 |
st.markdown("### Options")
|
| 406 |
show_details = st.checkbox("Show detailed results", value=False)
|
| 407 |
enable_prediction = st.checkbox("Enable sentiment prediction", value=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
st.markdown("---")
|
| 409 |
|
| 410 |
# Main content
|
|
@@ -448,8 +514,8 @@ def main():
|
|
| 448 |
with st.spinner(f"Gathering data for '{keyword}'..."):
|
| 449 |
start_time = time.time()
|
| 450 |
|
| 451 |
-
reddit_data = fetch_reddit_data(keyword)
|
| 452 |
-
youtube_data = fetch_youtube_data(keyword)
|
| 453 |
|
| 454 |
if reddit_data.empty and youtube_data.empty:
|
| 455 |
st.error("No data found. Try a different keyword.")
|
|
@@ -457,9 +523,17 @@ def main():
|
|
| 457 |
|
| 458 |
combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
|
| 459 |
|
|
|
|
|
|
|
|
|
|
| 460 |
# Filter out empty or invalid texts
|
|
|
|
| 461 |
combined_data = combined_data[combined_data['text'].str.strip() != '']
|
| 462 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
# Analyze in batches
|
| 464 |
analysis_results = []
|
| 465 |
for _, row in combined_data.iterrows():
|
|
@@ -469,6 +543,8 @@ def main():
|
|
| 469 |
combined_data['vader'] = [r['vader'] for r in analysis_results]
|
| 470 |
combined_data['bert'] = [r['bert'] for r in analysis_results]
|
| 471 |
combined_data['textblob'] = [r['textblob'] for r in analysis_results]
|
|
|
|
|
|
|
| 472 |
|
| 473 |
# Ensure no NaN values in sentiment scores
|
| 474 |
combined_data = combined_data.dropna(subset=['vader', 'bert', 'textblob'])
|
|
@@ -477,12 +553,15 @@ def main():
|
|
| 477 |
processing_time = time.time() - start_time
|
| 478 |
st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
|
| 479 |
|
|
|
|
|
|
|
|
|
|
| 480 |
st.subheader(f"π Overall Sentiment for '{keyword}'")
|
| 481 |
|
| 482 |
cols = st.columns(3)
|
| 483 |
avg_sentiment = combined_data['average'].mean()
|
| 484 |
-
pos_pct = (combined_data['average'] >
|
| 485 |
-
neg_pct = (combined_data['average'] <
|
| 486 |
|
| 487 |
cols[0].metric("Avg Sentiment", f"{avg_sentiment:.2f}",
|
| 488 |
"Positive" if avg_sentiment > 0 else "Negative" if avg_sentiment < 0 else "Neutral")
|
|
@@ -490,13 +569,32 @@ def main():
|
|
| 490 |
cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
|
| 491 |
|
| 492 |
st.subheader("π Content Visualization")
|
| 493 |
-
all_text = " ".join(combined_data['text'])
|
| 494 |
wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
|
| 495 |
if wordcloud_img:
|
| 496 |
st.image(wordcloud_img, use_column_width=True)
|
| 497 |
else:
|
| 498 |
st.info("No word cloud generated due to insufficient text")
|
| 499 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
# Filter recent data
|
| 501 |
combined_data['date'] = pd.to_datetime(combined_data['date'])
|
| 502 |
recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]
|
|
@@ -514,7 +612,7 @@ def main():
|
|
| 514 |
fig = plot_sentiment(full_data, keyword)
|
| 515 |
else:
|
| 516 |
daily_data = daily_data if daily_data is not None else recent_data[['date', 'average']].assign(type='actual')
|
| 517 |
-
fig = plot_sentiment(daily_data, keyword)
|
| 518 |
else:
|
| 519 |
daily_data = prepare_data_for_prediction(recent_data)
|
| 520 |
fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
|
|
@@ -533,12 +631,50 @@ def main():
|
|
| 533 |
else:
|
| 534 |
st.info("π Prediction: Sentiment is expected to remain stable in the next 15 days")
|
| 535 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
if show_details:
|
| 537 |
-
st.subheader("π Detailed Results")
|
| 538 |
-
st.dataframe(recent_data[['date', 'source', 'text', 'average']], use_container_width=True)
|
| 539 |
else:
|
| 540 |
st.info("No recent data found (within last 60 days).")
|
| 541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
if __name__ == "__main__":
|
| 543 |
try:
|
| 544 |
nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
|
|
|
|
| 19 |
from sklearn.linear_model import Ridge
|
| 20 |
from sklearn.preprocessing import PolynomialFeatures
|
| 21 |
from sklearn.pipeline import make_pipeline
|
| 22 |
+
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
|
| 23 |
+
from sklearn.model_selection import train_test_split
|
| 24 |
|
| 25 |
# --------------------------
|
| 26 |
# Initial Setup
|
|
|
|
| 373 |
st.error(f"Plotting error: {str(e)}")
|
| 374 |
return None
|
| 375 |
|
| 376 |
+
# --------------------------
|
| 377 |
+
# Evaluation & Dataset Stats
|
| 378 |
+
# --------------------------
|
| 379 |
+
|
| 380 |
+
def compute_dataset_stats(df):
    """Summarise a collected-items DataFrame for the statistics panel.

    Args:
        df: DataFrame of collected items. The ``date`` and ``source``
            columns are used when present; neither is required.

    Returns:
        dict with the keys:
            ``total``          number of rows in ``df``;
            ``start_date``     minimum of ``df['date']``, or ``None`` when it
                               cannot be computed;
            ``end_date``       maximum of ``df['date']``, or ``None`` when it
                               cannot be computed;
            ``source_counts``  ``{source: row count}``, empty when ``df`` has
                               no ``source`` column.
    """
    stats = {'total': len(df)}

    try:
        stats['start_date'] = df['date'].min()
        stats['end_date'] = df['date'].max()
    except (KeyError, TypeError, ValueError):
        # KeyError: there is no 'date' column.
        # TypeError / ValueError: the values cannot be ordered, e.g. the
        # column still mixes strings and timestamps because it has not been
        # parsed yet. Anything else is a real bug and should propagate
        # instead of being silently swallowed by a bare ``except``.
        stats['start_date'] = None
        stats['end_date'] = None

    if 'source' in df.columns:
        stats['source_counts'] = df['source'].value_counts().to_dict()
    else:
        stats['source_counts'] = {}
    return stats
|
| 391 |
+
|
| 392 |
+
def prepare_eval_labels(df, pos_thresh=0.1, neg_thresh=-0.1):
    """Attach categorical sentiment labels derived from the 'average' score.

    A score strictly above ``pos_thresh`` is 'Positive', strictly below
    ``neg_thresh`` is 'Negative', and everything else is 'Neutral'.

    Args:
        df: DataFrame with an ``average`` column of sentiment scores.
        pos_thresh: Exclusive lower bound for the 'Positive' class.
        neg_thresh: Exclusive upper bound for the 'Negative' class.

    Returns:
        A copy of ``df`` with two extra columns: ``label`` (class name) and
        ``label_num`` (0 = Negative, 1 = Neutral, 2 = Positive). The input
        frame is not modified.
    """
    def _classify(score):
        # Guard clauses in the same order as the thresholds are tested.
        if score > pos_thresh:
            return 'Positive'
        if score < neg_thresh:
            return 'Negative'
        return 'Neutral'

    class_codes = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

    labelled = df.copy()
    labelled['label'] = labelled['average'].apply(_classify)
    labelled['label_num'] = labelled['label'].map(class_codes)
    return labelled
|
| 399 |
+
|
| 400 |
+
def evaluate_sentiment_model(df, pos_thresh=0.1, neg_thresh=-0.1):
    """Score a Ridge baseline that predicts the sentiment class from model scores.

    Rows are labelled Negative/Neutral/Positive from their ``average`` score
    (via ``prepare_eval_labels``), a Ridge regressor is fitted on the
    ``vader``/``bert``/``textblob`` columns of an 80% training split, and its
    rounded, clipped predictions are compared with the labels of the held-out
    20%.

    Args:
        df: DataFrame with ``vader``, ``bert``, ``textblob`` and ``average``
            columns. Rows with a missing value in any of them are dropped.
        pos_thresh: Scores strictly above this are labelled Positive.
        neg_thresh: Scores strictly below this are labelled Negative.

    Returns:
        ``None`` when fewer than 5 usable rows remain; otherwise a dict with
        ``accuracy``, weighted ``precision`` / ``recall`` / ``f1``, a
        ``confusion_matrix`` and the ``class_map`` from class id to name.
    """
    df = df.dropna(subset=['vader', 'bert', 'textblob', 'average'])
    if len(df) < 5:
        return None  # insufficient data

    # Pass the thresholds through so evaluation can use the same class
    # boundaries the caller applies elsewhere (defaults match the original).
    df_eval = prepare_eval_labels(df, pos_thresh=pos_thresh, neg_thresh=neg_thresh)

    X = df_eval[['vader', 'bert', 'textblob']].values
    y = df_eval['label_num'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=42
    )

    # Ridge is a regressor: fit it on the ordinal class ids, then round the
    # continuous output to the nearest id and clip into the valid 0..2 range.
    clf = Ridge(alpha=1.0)
    clf.fit(X_train, y_train)
    raw_preds = clf.predict(X_test)
    preds = np.clip(np.round(raw_preds).astype(int), 0, 2)

    acc = accuracy_score(y_test, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, preds, average='weighted', zero_division=0
    )
    # Pin the label set so the matrix is always 3x3 and its rows/columns line
    # up with class_map, even when a class is absent from the small test split.
    cm = confusion_matrix(y_test, preds, labels=[0, 1, 2])

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'class_map': {'0': 'Negative', '1': 'Neutral', '2': 'Positive'}
    }
|
| 432 |
+
|
| 433 |
# --------------------------
|
| 434 |
# Main Application
|
| 435 |
# --------------------------
|
|
|
|
| 458 |
"Search keyword",
|
| 459 |
placeholder="e.g., Apple, Tesla, etc."
|
| 460 |
)
|
| 461 |
+
reddit_limit = st.number_input("Reddit results", min_value=5, max_value=200, value=30, step=5)
|
| 462 |
+
youtube_limit = st.number_input("YouTube results", min_value=5, max_value=50, value=30, step=5)
|
| 463 |
analyze_btn = st.button("Fetch & Analyze")
|
| 464 |
|
| 465 |
st.markdown("---")
|
| 466 |
st.markdown("### Options")
|
| 467 |
show_details = st.checkbox("Show detailed results", value=False)
|
| 468 |
enable_prediction = st.checkbox("Enable sentiment prediction", value=True)
|
| 469 |
+
show_evaluation = st.checkbox("Run model evaluation (adds compute)", value=True)
|
| 470 |
+
st.markdown("---")
|
| 471 |
+
st.markdown("### Experimental Settings")
|
| 472 |
+
pos_thresh = st.number_input("Positive threshold", value=0.1, step=0.05, format="%.2f")
|
| 473 |
+
neg_thresh = st.number_input("Negative threshold", value=-0.1, step=0.05, format="%.2f")
|
| 474 |
st.markdown("---")
|
| 475 |
|
| 476 |
# Main content
|
|
|
|
| 514 |
with st.spinner(f"Gathering data for '{keyword}'..."):
|
| 515 |
start_time = time.time()
|
| 516 |
|
| 517 |
+
reddit_data = fetch_reddit_data(keyword, limit=reddit_limit)
|
| 518 |
+
youtube_data = fetch_youtube_data(keyword, limit=youtube_limit)
|
| 519 |
|
| 520 |
if reddit_data.empty and youtube_data.empty:
|
| 521 |
st.error("No data found. Try a different keyword.")
|
|
|
|
| 523 |
|
| 524 |
combined_data = pd.concat([reddit_data, youtube_data], ignore_index=True)
|
| 525 |
|
| 526 |
+
# Basic dataset stats BEFORE cleaning
|
| 527 |
+
raw_stats = compute_dataset_stats(combined_data)
|
| 528 |
+
|
| 529 |
# Filter out empty or invalid texts
|
| 530 |
+
combined_data['text'] = combined_data['text'].fillna('').astype(str)
|
| 531 |
combined_data = combined_data[combined_data['text'].str.strip() != '']
|
| 532 |
|
| 533 |
+
# Ensure date column exists and is datetime
|
| 534 |
+
combined_data['date'] = pd.to_datetime(combined_data['date'], errors='coerce')
|
| 535 |
+
combined_data = combined_data.dropna(subset=['date'])
|
| 536 |
+
|
| 537 |
# Analyze in batches
|
| 538 |
analysis_results = []
|
| 539 |
for _, row in combined_data.iterrows():
|
|
|
|
| 543 |
combined_data['vader'] = [r['vader'] for r in analysis_results]
|
| 544 |
combined_data['bert'] = [r['bert'] for r in analysis_results]
|
| 545 |
combined_data['textblob'] = [r['textblob'] for r in analysis_results]
|
| 546 |
+
combined_data['bert_label'] = [r['bert_label'] for r in analysis_results]
|
| 547 |
+
combined_data['bert_confidence'] = [r['bert_confidence'] for r in analysis_results]
|
| 548 |
|
| 549 |
# Ensure no NaN values in sentiment scores
|
| 550 |
combined_data = combined_data.dropna(subset=['vader', 'bert', 'textblob'])
|
|
|
|
| 553 |
processing_time = time.time() - start_time
|
| 554 |
st.success(f"Analyzed {len(combined_data)} sources in {processing_time:.2f} seconds")
|
| 555 |
|
| 556 |
+
# Dataset statistics AFTER cleaning
|
| 557 |
+
cleaned_stats = compute_dataset_stats(combined_data)
|
| 558 |
+
|
| 559 |
st.subheader(f"π Overall Sentiment for '{keyword}'")
|
| 560 |
|
| 561 |
cols = st.columns(3)
|
| 562 |
avg_sentiment = combined_data['average'].mean()
|
| 563 |
+
pos_pct = (combined_data['average'] > pos_thresh).mean() * 100
|
| 564 |
+
neg_pct = (combined_data['average'] < neg_thresh).mean() * 100
|
| 565 |
|
| 566 |
cols[0].metric("Avg Sentiment", f"{avg_sentiment:.2f}",
|
| 567 |
"Positive" if avg_sentiment > 0 else "Negative" if avg_sentiment < 0 else "Neutral")
|
|
|
|
| 569 |
cols[2].metric("Negative Content", f"{neg_pct:.1f}%")
|
| 570 |
|
| 571 |
st.subheader("π Content Visualization")
|
| 572 |
+
all_text = " ".join(combined_data['text'].tolist())
|
| 573 |
wordcloud_img = f'data:image/png;base64,{generate_wordcloud(all_text)}'
|
| 574 |
if wordcloud_img:
|
| 575 |
st.image(wordcloud_img, use_column_width=True)
|
| 576 |
else:
|
| 577 |
st.info("No word cloud generated due to insufficient text")
|
| 578 |
|
| 579 |
+
# Show dataset stats panel
|
| 580 |
+
with st.expander("π Dataset Statistics (raw vs cleaned)"):
|
| 581 |
+
st.write("**Before cleaning**")
|
| 582 |
+
st.write(f"- Total collected: {raw_stats['total']}")
|
| 583 |
+
st.write(f"- Date range: {raw_stats['start_date']} -> {raw_stats['end_date']}")
|
| 584 |
+
st.write(f"- Source counts: {raw_stats['source_counts']}")
|
| 585 |
+
|
| 586 |
+
st.write("**After cleaning**")
|
| 587 |
+
st.write(f"- Total after cleaning: {cleaned_stats['total']}")
|
| 588 |
+
st.write(f"- Date range: {cleaned_stats['start_date']} -> {cleaned_stats['end_date']}")
|
| 589 |
+
st.write(f"- Source counts: {cleaned_stats['source_counts']}")
|
| 590 |
+
|
| 591 |
+
st.write("**Sentiment score summary (average)**")
|
| 592 |
+
st.write(combined_data['average'].describe().to_frame().T)
|
| 593 |
+
|
| 594 |
+
st.write("**Sentiment distribution histogram**")
|
| 595 |
+
fig_hist = px.histogram(combined_data, x='average', nbins=30, title="Average Sentiment Distribution")
|
| 596 |
+
st.plotly_chart(fig_hist, use_container_width=True)
|
| 597 |
+
|
| 598 |
# Filter recent data
|
| 599 |
combined_data['date'] = pd.to_datetime(combined_data['date'])
|
| 600 |
recent_data = combined_data[combined_data['date'] >= (datetime.now() - timedelta(days=60))]
|
|
|
|
| 612 |
fig = plot_sentiment(full_data, keyword)
|
| 613 |
else:
|
| 614 |
daily_data = daily_data if daily_data is not None else recent_data[['date', 'average']].assign(type='actual')
|
| 615 |
+
fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
|
| 616 |
else:
|
| 617 |
daily_data = prepare_data_for_prediction(recent_data)
|
| 618 |
fig = plot_sentiment(daily_data.assign(type='actual') if daily_data is not None else recent_data[['date', 'average']].assign(type='actual'), keyword)
|
|
|
|
| 631 |
else:
|
| 632 |
st.info("π Prediction: Sentiment is expected to remain stable in the next 15 days")
|
| 633 |
|
| 634 |
+
if show_evaluation:
|
| 635 |
+
with st.spinner("Running evaluation..."):
|
| 636 |
+
eval_results = evaluate_sentiment_model(recent_data)
|
| 637 |
+
if eval_results is None:
|
| 638 |
+
st.info("Not enough data points for evaluation.")
|
| 639 |
+
else:
|
| 640 |
+
st.subheader("π Model Evaluation Results (Baseline Ridge)")
|
| 641 |
+
st.write(f"**Accuracy:** {eval_results['accuracy']:.3f}")
|
| 642 |
+
st.write(f"**Precision:** {eval_results['precision']:.3f}")
|
| 643 |
+
st.write(f"**Recall:** {eval_results['recall']:.3f}")
|
| 644 |
+
st.write(f"**F1-score:** {eval_results['f1']:.3f}")
|
| 645 |
+
st.write("**Confusion Matrix:**")
|
| 646 |
+
st.write(eval_results['confusion_matrix'])
|
| 647 |
+
|
| 648 |
if show_details:
|
| 649 |
+
st.subheader("π Detailed Results (Recent Data)")
|
| 650 |
+
st.dataframe(recent_data[['date', 'source', 'text', 'vader', 'bert', 'textblob', 'average']], use_container_width=True)
|
| 651 |
else:
|
| 652 |
st.info("No recent data found (within last 60 days).")
|
| 653 |
+
|
| 654 |
+
# Experimental details panel outside main flow so reviewers see it even without running analyses
|
| 655 |
+
with st.expander("π§ͺ Experimental Details & Settings (Methodology)"):
|
| 656 |
+
st.markdown("""
|
| 657 |
+
**Preprocessing Steps**
|
| 658 |
+
- Remove empty posts and NAs.
|
| 659 |
+
- Truncate text to 2000 characters to keep BERT inference performant (BERT uses <=512 tokens).
|
| 660 |
+
- Convert model outputs to numeric ranges: BERT labels mapped to [-1, -0.5, 0, 0.5, 1].
|
| 661 |
+
- Aggregate VADER, BERT, and TextBlob by mean to create a fused 'average' sentiment score.
|
| 662 |
+
|
| 663 |
+
**Model Choices & Hyperparameters**
|
| 664 |
+
- VADER: default lexicon (lexicon-based sentiment for short social text).
|
| 665 |
+
- BERT: `nlptown/bert-base-multilingual-uncased-sentiment` used for multilingual rating-style classification.
|
| 666 |
+
- Regression baseline for temporal prediction: PolynomialFeatures(degree=2) + Ridge(alpha=1.0).
|
| 667 |
+
- Evaluation baseline: Ridge(alpha=1.0) regressor trained on [vader, bert, textblob] then rounded to class labels.
|
| 668 |
+
|
| 669 |
+
**Train/Test Split**
|
| 670 |
+
- Standard 80% train / 20% test split (random_state=42) used for evaluation experiments.
|
| 671 |
+
|
| 672 |
+
**Labeling thresholds**
|
| 673 |
+
- Positive threshold: configurable (default 0.1)
|
| 674 |
+
- Negative threshold: configurable (default -0.1)
|
| 675 |
+
""")
|
| 676 |
+
st.markdown("**Notes for reviewers:** Add more advanced time-series models (Prophet, ARIMA, LSTM) if temporal forecasting accuracy is critical. Current Ridge polynomial baseline is intentionally simple and explained in methodology.")
|
| 677 |
+
|
| 678 |
if __name__ == "__main__":
|
| 679 |
try:
|
| 680 |
nltk.data.path.append(os.path.join(os.path.expanduser("~"), "nltk_data"))
|