Text-Sentiment-Classifier/app.py at main · namitabagri/Text-Sentiment-Classifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import streamlit as st
import pandas as pd
import joblib
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from category_encoders import TargetEncoder
from scipy.sparse import hstack
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD


st.set_page_config(page_title="Twitter Sentiment Classifier", layout="wide")

# NLTK data packages requested at startup, in download order. Both the older
# names ('punkt', 'averaged_perceptron_tagger') and the newer ones
# ('punkt_tab', 'averaged_perceptron_tagger_eng') are fetched; per the
# original deployment notes, a missing 'punkt_tab' caused a runtime error.
_NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)

for _resource in _NLTK_RESOURCES:
    nltk.download(_resource, quiet=True)


# Load models and encoders
@st.cache_resource
def load_models():
    """Deserialize the fitted pipeline artifacts from the working directory.

    Returns:
        A 4-tuple ``(model, vectorizer, target_encoder, svd)`` loaded, in that
        order, from ``sentiment_model.pkl``, ``count_vectorizer.pkl``,
        ``target_encoder.pkl`` and ``svd_transformer.pkl``.
    """
    # The SVD transformer is loaded from disk like the other artifacts so the
    # app reuses the already-fitted object instead of fitting a new one.
    artifact_files = (
        'sentiment_model.pkl',
        'count_vectorizer.pkl',
        'target_encoder.pkl',
        'svd_transformer.pkl',
    )
    model, vectorizer, target_encoder, svd = (
        joblib.load(filename) for filename in artifact_files
    )
    return model, vectorizer, target_encoder, svd


# Bind the (cached) artifacts to module-level names; predict_sentiment and the
# probability chart below read these globals directly.
model, vectorizer, target_encoder, svd = load_models()


# Text cleaning function (no changes needed here)
def clean(text):
    """Clean and normalize a single text string for vectorization.

    Steps, in order: lowercase, drop every character that is not a lowercase
    ASCII letter or whitespace, tokenize, remove English stop words, POS-tag
    the remaining tokens, lemmatize each token according to its tag, and
    re-join the lemmas with single spaces.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as one space-separated string (empty if nothing
        survives the filtering).
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Whitespace must survive this step so the tokenizer can still see word
    # boundaries. The previous pattern used a doubled backslash inside a raw
    # string, which excluded a literal backslash instead of whitespace and
    # therefore fused the whole text into one token.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    tokens = [
        token for token in word_tokenize(letters_only)
        if token not in stop_words
    ]

    # NOTE(review): tags starting with 'R' are lemmatized with pos='a'. This
    # mapping is kept exactly as it was; confirm it matches the preprocessing
    # used when the vectorizer and model were trained before changing it.
    lemmas = [
        lemmatizer.lemmatize(word, pos='v') if tag.startswith('V') else
        lemmatizer.lemmatize(word, pos='n') if tag.startswith('N') else
        lemmatizer.lemmatize(word, pos='a') if tag.startswith('R') else
        lemmatizer.lemmatize(word)
        for word, tag in nltk.pos_tag(tokens)
    ]
    return ' '.join(lemmas)


# Prediction function
def predict_sentiment(text, title):
    """Run the full inference pipeline for one tweet.

    Args:
        text: Raw tweet text; it is passed through ``clean`` first.
        title: Topic/company label, encoded with the loaded target encoder.

    Returns:
        A pair ``(label, probabilities)``: the first element of
        ``model.predict`` and the full ``model.predict_proba`` output for the
        single input row.
    """
    # Bag-of-words row for the cleaned tweet, from the loaded vectorizer.
    bow_row = vectorizer.transform([clean(text)])

    # The target encoder receives a one-row frame with a 'title' column.
    topic_frame = pd.DataFrame({'title': [title]})
    topic_encoded = target_encoder.transform(topic_frame)

    # Encoded topic first, then the text counts, as one CSR matrix.
    combined = hstack([topic_encoded, bow_row]).tocsr()

    # Scale the row to unit L2 norm before dimensionality reduction.
    unit_rows = Normalizer(norm='l2').transform(combined)

    # Only transform with the loaded SVD; it must never be re-fitted here.
    reduced = svd.transform(unit_rows)

    predicted_labels = model.predict(reduced)
    class_probabilities = model.predict_proba(reduced)
    return predicted_labels[0], class_probabilities


# --- Streamlit UI (no changes needed here) ---

st.title("🐦 Twitter Sentiment Classifier")
st.write("This app predicts the sentiment of tweets about various products and companies.")
st.markdown("---")

# Two side-by-side inputs: topic selector on the left, tweet text on the right.
col1, col2 = st.columns(2)

with col1:
    title_options = [
        'Borderlands', 'Facebook', 'Amazon', 'Microsoft', 'Google', 'CallOfDuty',
        'TomClancysRainbowSix', 'MaddenNFL', 'FIFA', 'AssassinsCreed', 'CS-GO'
    ]
    title = st.selectbox("Select the topic/company of the tweet:", options=title_options)

with col2:
    user_input = st.text_area(
        "Enter the tweet text to analyze:",
        "I am really loving the new update, it's so good!",
        height=150,
    )

if st.button("Analyze Sentiment", type="primary"):
    if not (user_input and title):
        # Nothing to classify without both a topic and some text.
        st.warning("Please select a topic and enter some text to analyze.")
    else:
        with st.spinner('Analyzing...'):
            prediction, probabilities = predict_sentiment(user_input, title)

            st.subheader("Prediction Result")

            # Choose the banner style and message for the predicted label;
            # any label other than the three named ones gets the warning style.
            if prediction == 'Positive':
                show_banner, banner_text = st.success, f"**Predicted Sentiment: {prediction}** 👍"
            elif prediction == 'Negative':
                show_banner, banner_text = st.error, f"**Predicted Sentiment: {prediction}** 👎"
            elif prediction == 'Neutral':
                show_banner, banner_text = st.info, f"**Predicted Sentiment: {prediction}** 😐"
            else:  # Irrelevant
                show_banner, banner_text = st.warning, f"**Predicted Sentiment: {prediction}** 🤷"
            show_banner(banner_text)

            st.write("**Probability Distribution:**")
            # One row per class, ordered from most to least probable.
            prob_df = (
                pd.DataFrame(probabilities, columns=model.classes_, index=["Probability"])
                .T
                .sort_values(by="Probability", ascending=False)
            )
            st.bar_chart(prob_df)

# Static "About" panel shown in the sidebar.
_ABOUT_TEXT = """
This sentiment classifier was trained on the 'Twitter Entity Sentiment Analysis' dataset. The notebook demonstrates the use of a Logistic Regression model with the Bag-of-Words text representation technique.

**Key Steps:**
- **Text Cleaning:** Lowercasing, punctuation/stopword removal, and lemmatization.
- **Feature Engineering:**
    - `CountVectorizer` (Bag-of-Words) for tweet text.
    - `TargetEncoder` for the tweet's topic/title.
- **Modeling:** A `LogisticRegression` model is used for classification.
"""

sidebar = st.sidebar
sidebar.header("About")
sidebar.info(_ABOUT_TEXT)