diff --git a/lab-nlp.ipynb b/lab-nlp.ipynb
new file mode 100644
index 0000000..a398e1c
--- /dev/null
+++ b/lab-nlp.ipynb
@@ -0,0 +1,2215 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "0db989b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import re\n",
+ "import nltk  # Natural Language Toolkit -- this package is quite messy: it was poorly designed and the documentation is not great\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.corpus import stopwords\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "fabf32dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_up(text):\n",
+ " url_pattern = r'https?://\\S+|www\\.\\S+'\n",
+ " symbols_pattern = r'[^a-zA-Z\\s]'\n",
+ "\n",
+ " text_without_urls = re.sub(url_pattern, '', text)\n",
+ " cleaned_text = re.sub(pattern, ' ', text_without_urls)\n",
+ " \n",
+ " return cleaned_text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "9ed3c2d2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def tokenize(text):\n",
+ "    \"\"\"Split *text* into tokens with NLTK's ``word_tokenize``.\n",
+ "\n",
+ "    Args:\n",
+ "        text: The string to tokenize.\n",
+ "\n",
+ "    Returns:\n",
+ "        The list of tokens produced by ``word_tokenize``.\n",
+ "    \"\"\"\n",
+ "    from nltk.tokenize import word_tokenize\n",
+ "\n",
+ "    # Only call the downloader when the 'punkt' data is missing, instead of\n",
+ "    # invoking it (and printing its log) on every single call.\n",
+ "    try:\n",
+ "        nltk.data.find('tokenizers/punkt')\n",
+ "    except LookupError:\n",
+ "        nltk.download('punkt')\n",
+ "\n",
+ "    return word_tokenize(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0658b876",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def stem_and_lemmatize(text):\n",
+ "    \"\"\"Stem every token with Snowball, then lemmatize the stems with WordNet.\n",
+ "\n",
+ "    Args:\n",
+ "        text: An iterable of word tokens (despite the name, not a raw string;\n",
+ "            iterating a string would process it character by character).\n",
+ "\n",
+ "    Returns:\n",
+ "        A list with one stemmed-then-lemmatized string per input token.\n",
+ "    \"\"\"\n",
+ "    from nltk.stem import SnowballStemmer\n",
+ "    from nltk.stem import WordNetLemmatizer\n",
+ "\n",
+ "    # Only call the downloader for data that is not installed yet, instead\n",
+ "    # of invoking it on every call.\n",
+ "    for resource, package in (('corpora/wordnet', 'wordnet'),\n",
+ "                              ('corpora/omw-1.4', 'omw-1.4')):\n",
+ "        try:\n",
+ "            nltk.data.find(resource)\n",
+ "        except LookupError:\n",
+ "            nltk.download(package)\n",
+ "\n",
+ "    ps = SnowballStemmer(language='english')\n",
+ "    # Iterate the argument: the previous code read an undefined global\n",
+ "    # `tokens` and ignored the `text` parameter.\n",
+ "    stemmed = [ps.stem(word) for word in text]\n",
+ "\n",
+ "    lemmatizer = WordNetLemmatizer()\n",
+ "    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]\n",
+ "    return lemmatized"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "66af8ddd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_stopwords(text):\n",
+ "    \"\"\"Return the tokens of *text* that are not English stop words.\n",
+ "\n",
+ "    Args:\n",
+ "        text: An iterable of word tokens (despite the name, not a raw string).\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of the tokens that are absent from NLTK's English stop-word\n",
+ "        list, in their original order.\n",
+ "    \"\"\"\n",
+ "    from nltk.corpus import stopwords\n",
+ "\n",
+ "    # Only call the downloader when the corpus is missing.\n",
+ "    try:\n",
+ "        nltk.data.find('corpora/stopwords')\n",
+ "    except LookupError:\n",
+ "        nltk.download('stopwords')\n",
+ "\n",
+ "    # Build the lookup set once. The previous code called stopwords.words()\n",
+ "    # for every token and, with no language given, used every language's list;\n",
+ "    # 'english' matches the stop_words set used by the combined pipeline.\n",
+ "    stop_words = set(stopwords.words('english'))\n",
+ "\n",
+ "    # Filter the argument: the previous code read an undefined global\n",
+ "    # `lemmatized` and ignored the `text` parameter.\n",
+ "    without_sw = [word for word in text if word not in stop_words]\n",
+ "    return without_sw"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8532b34f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n",
+ "[nltk_data] Downloading package wordnet to\n",
+ "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n",
+ "[nltk_data] Downloading package omw-1.4 to\n",
+ "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package omw-1.4 is already up-to-date!\n",
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from nltk.tokenize import word_tokenize\n",
+ "from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.corpus import wordnet\n",
+ "from nltk.corpus import stopwords\n",
+ "nltk.download('punkt')\n",
+ "nltk.download('wordnet') # wordnet is the most well known lemmatizer for english\n",
+ "nltk.download('omw-1.4')\n",
+ "nltk.download('stopwords')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "0fc35c50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function for cleaning, tokenizing, stemming, lemmatizing and removing stopwords\n",
+ "\n",
+ "import re\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import SnowballStemmer, WordNetLemmatizer\n",
+ "\n",
+ "# Precompile regular expressions to gain efficiency\n",
+ "url_pattern = re.compile(r'https?://\\S+|www\\.\\S+')\n",
+ "symbols_pattern = re.compile(r'[^a-zA-Z\\s]')\n",
+ "\n",
+ "# Define stopwords as a set\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "\n",
+ "def tokenize_stem_lemmatize_stopwords(text):\n",
+ "    \"\"\"Clean a raw text and return its normalized, stop-word-free tokens.\n",
+ "\n",
+ "    Steps, in order: delete URLs, replace non-letter characters with spaces,\n",
+ "    lowercase and tokenize, drop English stop words, Snowball-stem,\n",
+ "    WordNet-lemmatize, and finally drop any result that is itself a stop word.\n",
+ "\n",
+ "    Args:\n",
+ "        text: The raw string (e.g. one tweet).\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of processed tokens.\n",
+ "    \"\"\"\n",
+ "    # Remove URLs\n",
+ "    text_without_urls = url_pattern.sub('', text)\n",
+ "\n",
+ "    # Remove symbols\n",
+ "    cleaned_text = symbols_pattern.sub(' ', text_without_urls)\n",
+ "\n",
+ "    # Tokenize and convert to lowercase\n",
+ "    tokens = word_tokenize(cleaned_text.lower())\n",
+ "\n",
+ "    # Remove stop words while the tokens are still dictionary words. Filtering\n",
+ "    # only after stemming/lemmatizing let mangled forms such as 'veri' (very)\n",
+ "    # and 'wa' (was) slip through, because they no longer match the list.\n",
+ "    content_tokens = [word for word in tokens if word not in stop_words]\n",
+ "\n",
+ "    # Stemming\n",
+ "    ps = SnowballStemmer(language='english')\n",
+ "    stemmed = [ps.stem(word) for word in content_tokens]\n",
+ "\n",
+ "    # Lemmatization\n",
+ "    lemmatizer = WordNetLemmatizer()\n",
+ "    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]\n",
+ "\n",
+ "    # Second pass, as before: drop normalized tokens that equal a stop word\n",
+ "    without_sw = [word for word in lemmatized if word not in stop_words]\n",
+ "\n",
+ "    return without_sw"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "7692dfe5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1467810369 | \n",
+ " Mon Apr 06 22:19:45 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _TheSpecialOne_ | \n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 1467811184 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElleCTF | \n",
+ " my whole body feels itchy and like its on fire | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 1467811193 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Karoli | \n",
+ " @nationwideclass no, it's not behaving at all.... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 1467811372 | \n",
+ " Mon Apr 06 22:20:00 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " joy_wolf | \n",
+ " @Kwesidei not the whole crew | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n",
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
+ "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
+ "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
+ "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
+ "\n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
+ "0 is upset that he can't update his Facebook by ... \n",
+ "1 @Kenichan I dived many times for the ball. Man... \n",
+ "2 my whole body feels itchy and like its on fire \n",
+ "3 @nationwideclass no, it's not behaving at all.... \n",
+ "4 @Kwesidei not the whole crew "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#importing dataset\n",
+ "\n",
+ "# The file has no header row: without header=None pandas uses the first\n",
+ "# tweet as the column names and that record is lost from the data.\n",
+ "tweets = pd.read_csv('tweets.csv', encoding='ISO-8859-1', header=None)\n",
+ "tweets.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "4951651e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweets.columns = ['target','id','date','flag','user','text']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "24054805",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 1467811184 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElleCTF | \n",
+ " my whole body feels itchy and like its on fire | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 1467811193 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Karoli | \n",
+ " @nationwideclass no, it's not behaving at all.... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 1467811372 | \n",
+ " Mon Apr 06 22:20:00 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " joy_wolf | \n",
+ " @Kwesidei not the whole crew | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1599994 | \n",
+ " 4 | \n",
+ " 2193601966 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " AmandaMarie1028 | \n",
+ " Just woke up. Having no school is the best fee... | \n",
+ "
\n",
+ " \n",
+ " | 1599995 | \n",
+ " 4 | \n",
+ " 2193601969 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " TheWDBoards | \n",
+ " TheWDB.com - Very cool to hear old Walt interv... | \n",
+ "
\n",
+ " \n",
+ " | 1599996 | \n",
+ " 4 | \n",
+ " 2193601991 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " bpbabe | \n",
+ " Are you ready for your MoJo Makeover? Ask me f... | \n",
+ "
\n",
+ " \n",
+ " | 1599997 | \n",
+ " 4 | \n",
+ " 2193602064 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " tinydiamondz | \n",
+ " Happy 38th Birthday to my boo of alll time!!! ... | \n",
+ "
\n",
+ " \n",
+ " | 1599998 | \n",
+ " 4 | \n",
+ " 2193602129 | \n",
+ " Tue Jun 16 08:40:50 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " RyanTrevMorris | \n",
+ " happy #charitytuesday @theNSPCC @SparksCharity... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1599999 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n",
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n",
+ "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... \n",
+ "1599994 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599995 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599996 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599997 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599998 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \n",
+ "0 scotthamilton is upset that he can't update his Facebook by ... \n",
+ "1 mattycus @Kenichan I dived many times for the ball. Man... \n",
+ "2 ElleCTF my whole body feels itchy and like its on fire \n",
+ "3 Karoli @nationwideclass no, it's not behaving at all.... \n",
+ "4 joy_wolf @Kwesidei not the whole crew \n",
+ "... ... ... \n",
+ "1599994 AmandaMarie1028 Just woke up. Having no school is the best fee... \n",
+ "1599995 TheWDBoards TheWDB.com - Very cool to hear old Walt interv... \n",
+ "1599996 bpbabe Are you ready for your MoJo Makeover? Ask me f... \n",
+ "1599997 tinydiamondz Happy 38th Birthday to my boo of alll time!!! ... \n",
+ "1599998 RyanTrevMorris happy #charitytuesday @theNSPCC @SparksCharity... \n",
+ "\n",
+ "[1599999 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "e9ce3971",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 788923 | \n",
+ " 0 | \n",
+ " 2325257801 | \n",
+ " Thu Jun 25 05:05:48 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " makahiya | \n",
+ " sad and got hurt from a friend's comment | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 91290 | \n",
+ " 0 | \n",
+ " 1759452127 | \n",
+ " Sun May 10 19:14:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " justkarii | \n",
+ " I want chris back :'( how did things get this ... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 841280 | \n",
+ " 4 | \n",
+ " 1562361577 | \n",
+ " Sun Apr 19 19:48:09 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " blueyesblue | \n",
+ " @vnakic dude, i'm surprised you found me | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1530496 | \n",
+ " 4 | \n",
+ " 2177820795 | \n",
+ " Mon Jun 15 06:45:24 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " monnie | \n",
+ " Hey Internet! You know that thing you can get ... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 771592 | \n",
+ " 0 | \n",
+ " 2302441207 | \n",
+ " Tue Jun 23 16:48:46 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " backpocketblues | \n",
+ " @rawralyrawr apparently... "dick rails&qu... | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 19995 | \n",
+ " 331165 | \n",
+ " 0 | \n",
+ " 2012563615 | \n",
+ " Tue Jun 02 20:52:47 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " MizMari | \n",
+ " @seanjissomean dude jk LOL i'll miss ya wing m... | \n",
+ "
\n",
+ " \n",
+ " | 19996 | \n",
+ " 1202127 | \n",
+ " 4 | \n",
+ " 1985803236 | \n",
+ " Sun May 31 17:48:12 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " LexiSunshine | \n",
+ " Hey @x17online zaaaaaaac <333 so now all th... | \n",
+ "
\n",
+ " \n",
+ " | 19997 | \n",
+ " 1063769 | \n",
+ " 4 | \n",
+ " 1964544524 | \n",
+ " Fri May 29 14:24:23 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " aeonbeat | \n",
+ " @faces i was looking for the book two days ago... | \n",
+ "
\n",
+ " \n",
+ " | 19998 | \n",
+ " 13456 | \n",
+ " 0 | \n",
+ " 1553264085 | \n",
+ " Sat Apr 18 13:48:15 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " angx | \n",
+ " http://twitpic.com/3jqma - I wish they fit ma... | \n",
+ "
\n",
+ " \n",
+ " | 19999 | \n",
+ " 1423044 | \n",
+ " 4 | \n",
+ " 2058622571 | \n",
+ " Sat Jun 06 15:18:17 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " sarahbrowntown | \n",
+ " @Shannoncurrie bahaha! you're wonderful | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20000 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index target id date flag \\\n",
+ "0 788923 0 2325257801 Thu Jun 25 05:05:48 PDT 2009 NO_QUERY \n",
+ "1 91290 0 1759452127 Sun May 10 19:14:36 PDT 2009 NO_QUERY \n",
+ "2 841280 4 1562361577 Sun Apr 19 19:48:09 PDT 2009 NO_QUERY \n",
+ "3 1530496 4 2177820795 Mon Jun 15 06:45:24 PDT 2009 NO_QUERY \n",
+ "4 771592 0 2302441207 Tue Jun 23 16:48:46 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... ... \n",
+ "19995 331165 0 2012563615 Tue Jun 02 20:52:47 PDT 2009 NO_QUERY \n",
+ "19996 1202127 4 1985803236 Sun May 31 17:48:12 PDT 2009 NO_QUERY \n",
+ "19997 1063769 4 1964544524 Fri May 29 14:24:23 PDT 2009 NO_QUERY \n",
+ "19998 13456 0 1553264085 Sat Apr 18 13:48:15 PDT 2009 NO_QUERY \n",
+ "19999 1423044 4 2058622571 Sat Jun 06 15:18:17 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \n",
+ "0 makahiya sad and got hurt from a friend's comment \n",
+ "1 justkarii I want chris back :'( how did things get this ... \n",
+ "2 blueyesblue @vnakic dude, i'm surprised you found me \n",
+ "3 monnie Hey Internet! You know that thing you can get ... \n",
+ "4 backpocketblues @rawralyrawr apparently... "dick rails&qu... \n",
+ "... ... ... \n",
+ "19995 MizMari @seanjissomean dude jk LOL i'll miss ya wing m... \n",
+ "19996 LexiSunshine Hey @x17online zaaaaaaac <333 so now all th... \n",
+ "19997 aeonbeat @faces i was looking for the book two days ago... \n",
+ "19998 angx http://twitpic.com/3jqma - I wish they fit ma... \n",
+ "19999 sarahbrowntown @Shannoncurrie bahaha! you're wonderful \n",
+ "\n",
+ "[20000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#creating a sample of 20000\n",
+ "\n",
+ "# Fixed seed so that re-running the notebook draws the same 20000 tweets.\n",
+ "# reset_index() keeps each row's original label in a new 'index' column.\n",
+ "sampled_tweets = tweets.sample(n=20000, random_state=42)\n",
+ "sampled_tweets = sampled_tweets.reset_index()\n",
+ "sampled_tweets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c0b00eca",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 788923 | \n",
+ " 0 | \n",
+ " 2325257801 | \n",
+ " Thu Jun 25 05:05:48 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " makahiya | \n",
+ " sad and got hurt from a friend's comment | \n",
+ " [sad, got, hurt, friend, comment] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 91290 | \n",
+ " 0 | \n",
+ " 1759452127 | \n",
+ " Sun May 10 19:14:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " justkarii | \n",
+ " I want chris back :'( how did things get this ... | \n",
+ " [want, chris, back, thing, get, fuck, miss, fr... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 841280 | \n",
+ " 4 | \n",
+ " 1562361577 | \n",
+ " Sun Apr 19 19:48:09 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " blueyesblue | \n",
+ " @vnakic dude, i'm surprised you found me | \n",
+ " [vnakic, dude, surpris, found] | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1530496 | \n",
+ " 4 | \n",
+ " 2177820795 | \n",
+ " Mon Jun 15 06:45:24 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " monnie | \n",
+ " Hey Internet! You know that thing you can get ... | \n",
+ " [hey, internet, know, thing, get, put, memori,... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 771592 | \n",
+ " 0 | \n",
+ " 2302441207 | \n",
+ " Tue Jun 23 16:48:46 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " backpocketblues | \n",
+ " @rawralyrawr apparently... "dick rails&qu... | \n",
+ " [rawralyrawr, appar, quot, dick, rail, quot, g... | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 19995 | \n",
+ " 331165 | \n",
+ " 0 | \n",
+ " 2012563615 | \n",
+ " Tue Jun 02 20:52:47 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " MizMari | \n",
+ " @seanjissomean dude jk LOL i'll miss ya wing m... | \n",
+ " [seanjissomean, dude, jk, lol, miss, ya, wing,... | \n",
+ "
\n",
+ " \n",
+ " | 19996 | \n",
+ " 1202127 | \n",
+ " 4 | \n",
+ " 1985803236 | \n",
+ " Sun May 31 17:48:12 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " LexiSunshine | \n",
+ " Hey @x17online zaaaaaaac <333 so now all th... | \n",
+ " [hey, x, onlin, zaaaaaaac, lt, import, peopl, ... | \n",
+ "
\n",
+ " \n",
+ " | 19997 | \n",
+ " 1063769 | \n",
+ " 4 | \n",
+ " 1964544524 | \n",
+ " Fri May 29 14:24:23 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " aeonbeat | \n",
+ " @faces i was looking for the book two days ago... | \n",
+ " [face, wa, look, book, two, day, ago, burga, f... | \n",
+ "
\n",
+ " \n",
+ " | 19998 | \n",
+ " 13456 | \n",
+ " 0 | \n",
+ " 1553264085 | \n",
+ " Sat Apr 18 13:48:15 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " angx | \n",
+ " http://twitpic.com/3jqma - I wish they fit ma... | \n",
+ " [wish, fit, mayb, stuff, lot] | \n",
+ "
\n",
+ " \n",
+ " | 19999 | \n",
+ " 1423044 | \n",
+ " 4 | \n",
+ " 2058622571 | \n",
+ " Sat Jun 06 15:18:17 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " sarahbrowntown | \n",
+ " @Shannoncurrie bahaha! you're wonderful | \n",
+ " [shannoncurri, bahaha, wonder] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20000 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index target id date flag \\\n",
+ "0 788923 0 2325257801 Thu Jun 25 05:05:48 PDT 2009 NO_QUERY \n",
+ "1 91290 0 1759452127 Sun May 10 19:14:36 PDT 2009 NO_QUERY \n",
+ "2 841280 4 1562361577 Sun Apr 19 19:48:09 PDT 2009 NO_QUERY \n",
+ "3 1530496 4 2177820795 Mon Jun 15 06:45:24 PDT 2009 NO_QUERY \n",
+ "4 771592 0 2302441207 Tue Jun 23 16:48:46 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... ... \n",
+ "19995 331165 0 2012563615 Tue Jun 02 20:52:47 PDT 2009 NO_QUERY \n",
+ "19996 1202127 4 1985803236 Sun May 31 17:48:12 PDT 2009 NO_QUERY \n",
+ "19997 1063769 4 1964544524 Fri May 29 14:24:23 PDT 2009 NO_QUERY \n",
+ "19998 13456 0 1553264085 Sat Apr 18 13:48:15 PDT 2009 NO_QUERY \n",
+ "19999 1423044 4 2058622571 Sat Jun 06 15:18:17 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "0 makahiya sad and got hurt from a friend's comment \n",
+ "1 justkarii I want chris back :'( how did things get this ... \n",
+ "2 blueyesblue @vnakic dude, i'm surprised you found me \n",
+ "3 monnie Hey Internet! You know that thing you can get ... \n",
+ "4 backpocketblues @rawralyrawr apparently... "dick rails&qu... \n",
+ "... ... ... \n",
+ "19995 MizMari @seanjissomean dude jk LOL i'll miss ya wing m... \n",
+ "19996 LexiSunshine Hey @x17online zaaaaaaac <333 so now all th... \n",
+ "19997 aeonbeat @faces i was looking for the book two days ago... \n",
+ "19998 angx http://twitpic.com/3jqma - I wish they fit ma... \n",
+ "19999 sarahbrowntown @Shannoncurrie bahaha! you're wonderful \n",
+ "\n",
+ " text_processed \n",
+ "0 [sad, got, hurt, friend, comment] \n",
+ "1 [want, chris, back, thing, get, fuck, miss, fr... \n",
+ "2 [vnakic, dude, surpris, found] \n",
+ "3 [hey, internet, know, thing, get, put, memori,... \n",
+ "4 [rawralyrawr, appar, quot, dick, rail, quot, g... \n",
+ "... ... \n",
+ "19995 [seanjissomean, dude, jk, lol, miss, ya, wing,... \n",
+ "19996 [hey, x, onlin, zaaaaaaac, lt, import, peopl, ... \n",
+ "19997 [face, wa, look, book, two, day, ago, burga, f... \n",
+ "19998 [wish, fit, mayb, stuff, lot] \n",
+ "19999 [shannoncurri, bahaha, wonder] \n",
+ "\n",
+ "[20000 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Run the preprocessing function on every sampled tweet and store the token lists\n",
+ "\n",
+ "sampled_tweets['text_processed'] = [\n",
+ "    tokenize_stem_lemmatize_stopwords(tweet) for tweet in sampled_tweets['text']\n",
+ "]\n",
+ "sampled_tweets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "9f9caf6a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 788923 | \n",
+ " 0 | \n",
+ " 2325257801 | \n",
+ " Thu Jun 25 05:05:48 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " makahiya | \n",
+ " sad and got hurt from a friend's comment | \n",
+ " sad got hurt friend comment | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 91290 | \n",
+ " 0 | \n",
+ " 1759452127 | \n",
+ " Sun May 10 19:14:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " justkarii | \n",
+ " I want chris back :'( how did things get this ... | \n",
+ " want chris back thing get fuck miss friend | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 841280 | \n",
+ " 4 | \n",
+ " 1562361577 | \n",
+ " Sun Apr 19 19:48:09 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " blueyesblue | \n",
+ " @vnakic dude, i'm surprised you found me | \n",
+ " vnakic dude surpris found | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1530496 | \n",
+ " 4 | \n",
+ " 2177820795 | \n",
+ " Mon Jun 15 06:45:24 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " monnie | \n",
+ " Hey Internet! You know that thing you can get ... | \n",
+ " hey internet know thing get put memori card pu... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 771592 | \n",
+ " 0 | \n",
+ " 2302441207 | \n",
+ " Tue Jun 23 16:48:46 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " backpocketblues | \n",
+ " @rawralyrawr apparently... "dick rails&qu... | \n",
+ " rawralyrawr appar quot dick rail quot girl two... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index target id date flag \\\n",
+ "0 788923 0 2325257801 Thu Jun 25 05:05:48 PDT 2009 NO_QUERY \n",
+ "1 91290 0 1759452127 Sun May 10 19:14:36 PDT 2009 NO_QUERY \n",
+ "2 841280 4 1562361577 Sun Apr 19 19:48:09 PDT 2009 NO_QUERY \n",
+ "3 1530496 4 2177820795 Mon Jun 15 06:45:24 PDT 2009 NO_QUERY \n",
+ "4 771592 0 2302441207 Tue Jun 23 16:48:46 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "0 makahiya sad and got hurt from a friend's comment \n",
+ "1 justkarii I want chris back :'( how did things get this ... \n",
+ "2 blueyesblue @vnakic dude, i'm surprised you found me \n",
+ "3 monnie Hey Internet! You know that thing you can get ... \n",
+ "4 backpocketblues @rawralyrawr apparently... "dick rails&qu... \n",
+ "\n",
+ " text_processed \n",
+ "0 sad got hurt friend comment \n",
+ "1 want chris back thing get fuck miss friend \n",
+ "2 vnakic dude surpris found \n",
+ "3 hey internet know thing get put memori card pu... \n",
+ "4 rawralyrawr appar quot dick rail quot girl two... "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def re_blob(row):\n",
+ " return \" \".join(row['text_processed'])\n",
+ "\n",
+ "sampled_tweets['text_processed'] = sampled_tweets.apply(re_blob,axis=1)\n",
+ "sampled_tweets.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "id": "06832ee5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ " clean_blob | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 442050 | \n",
+ " 0 | \n",
+ " 2067173325 | \n",
+ " Sun Jun 07 11:52:50 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " rike90 | \n",
+ " woooohoo!? what the hell... it's cold | \n",
+ " [woooohoo, hell, cold] | \n",
+ " woooohoo hell cold | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 535756 | \n",
+ " 0 | \n",
+ " 2198105892 | \n",
+ " Tue Jun 16 15:36:42 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Robbie_Taylor | \n",
+ " where those *& ^% Iphone 3.0 update remain... | \n",
+ " [amp, iphon, updat, remain, allreadi, midnight] | \n",
+ " amp iphon updat remain allreadi midnight | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 631121 | \n",
+ " 0 | \n",
+ " 2232222174 | \n",
+ " Thu Jun 18 19:37:09 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " GinaXP | \n",
+ " i just experienced the 1st time of falling asl... | \n",
+ " [experienc, st, time, fall, asleep, wake, scre... | \n",
+ " experienc st time fall asleep wake scream beca... | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 191761 | \n",
+ " 0 | \n",
+ " 1969708462 | \n",
+ " Sat May 30 00:56:40 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Amy_E_W | \n",
+ " Getting glasses today | \n",
+ " [get, glass, today] | \n",
+ " get glass today | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 681963 | \n",
+ " 0 | \n",
+ " 2249873888 | \n",
+ " Fri Jun 19 23:11:58 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " RacheLyn5485 | \n",
+ " @boysforpele32 Awww....that's sad | \n",
+ " [boysforpel, sad] | \n",
+ " boysforpel sad | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 19990 | \n",
+ " 347498 | \n",
+ " 0 | \n",
+ " 2016485849 | \n",
+ " Wed Jun 03 06:51:39 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " naomijlea | \n",
+ " @Rufus_Jay oww I thought it might be your gran... | \n",
+ " [rufus, jay, oww, thought, might, grandad, som... | \n",
+ " rufus jay oww thought might grandad someth oh ... | \n",
+ "
\n",
+ " \n",
+ " | 19991 | \n",
+ " 547055 | \n",
+ " 0 | \n",
+ " 2201959167 | \n",
+ " Tue Jun 16 21:01:39 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " eponaproblemos | \n",
+ " Is super grossed out by the huge pimple! | \n",
+ " [super, gross, huge, pimpl] | \n",
+ " super gross huge pimpl | \n",
+ "
\n",
+ " \n",
+ " | 19996 | \n",
+ " 32191 | \n",
+ " 0 | \n",
+ " 1564195769 | \n",
+ " Mon Apr 20 02:15:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " roban09 | \n",
+ " Going to bed... did not finish MC | \n",
+ " [go, bed, finish, mc] | \n",
+ " go bed finish mc | \n",
+ "
\n",
+ " \n",
+ " | 19998 | \n",
+ " 771447 | \n",
+ " 0 | \n",
+ " 2302366040 | \n",
+ " Tue Jun 23 16:42:25 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " jlbbb143 | \n",
+ " @Bob_Roony yup | \n",
+ " [bob, rooni, yup] | \n",
+ " bob rooni yup | \n",
+ "
\n",
+ " \n",
+ " | 19999 | \n",
+ " 165388 | \n",
+ " 0 | \n",
+ " 1960810082 | \n",
+ " Fri May 29 08:32:21 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " cbond007 | \n",
+ " I don't think I'm very well. Just in time for ... | \n",
+ " [think, veri, well, time, weekend] | \n",
+ " think veri well time weekend | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
9992 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index target id date flag \\\n",
+ "0 442050 0 2067173325 Sun Jun 07 11:52:50 PDT 2009 NO_QUERY \n",
+ "3 535756 0 2198105892 Tue Jun 16 15:36:42 PDT 2009 NO_QUERY \n",
+ "7 631121 0 2232222174 Thu Jun 18 19:37:09 PDT 2009 NO_QUERY \n",
+ "9 191761 0 1969708462 Sat May 30 00:56:40 PDT 2009 NO_QUERY \n",
+ "12 681963 0 2249873888 Fri Jun 19 23:11:58 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... ... \n",
+ "19990 347498 0 2016485849 Wed Jun 03 06:51:39 PDT 2009 NO_QUERY \n",
+ "19991 547055 0 2201959167 Tue Jun 16 21:01:39 PDT 2009 NO_QUERY \n",
+ "19996 32191 0 1564195769 Mon Apr 20 02:15:53 PDT 2009 NO_QUERY \n",
+ "19998 771447 0 2302366040 Tue Jun 23 16:42:25 PDT 2009 NO_QUERY \n",
+ "19999 165388 0 1960810082 Fri May 29 08:32:21 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "0 rike90 woooohoo!? what the hell... it's cold \n",
+ "3 Robbie_Taylor where those *& ^% Iphone 3.0 update remain... \n",
+ "7 GinaXP i just experienced the 1st time of falling asl... \n",
+ "9 Amy_E_W Getting glasses today \n",
+ "12 RacheLyn5485 @boysforpele32 Awww....that's sad \n",
+ "... ... ... \n",
+ "19990 naomijlea @Rufus_Jay oww I thought it might be your gran... \n",
+ "19991 eponaproblemos Is super grossed out by the huge pimple! \n",
+ "19996 roban09 Going to bed... did not finish MC \n",
+ "19998 jlbbb143 @Bob_Roony yup \n",
+ "19999 cbond007 I don't think I'm very well. Just in time for ... \n",
+ "\n",
+ " text_processed \\\n",
+ "0 [woooohoo, hell, cold] \n",
+ "3 [amp, iphon, updat, remain, allreadi, midnight] \n",
+ "7 [experienc, st, time, fall, asleep, wake, scre... \n",
+ "9 [get, glass, today] \n",
+ "12 [boysforpel, sad] \n",
+ "... ... \n",
+ "19990 [rufus, jay, oww, thought, might, grandad, som... \n",
+ "19991 [super, gross, huge, pimpl] \n",
+ "19996 [go, bed, finish, mc] \n",
+ "19998 [bob, rooni, yup] \n",
+ "19999 [think, veri, well, time, weekend] \n",
+ "\n",
+ " clean_blob \n",
+ "0 woooohoo hell cold \n",
+ "3 amp iphon updat remain allreadi midnight \n",
+ "7 experienc st time fall asleep wake scream beca... \n",
+ "9 get glass today \n",
+ "12 boysforpel sad \n",
+ "... ... \n",
+ "19990 rufus jay oww thought might grandad someth oh ... \n",
+ "19991 super gross huge pimpl \n",
+ "19996 go bed finish mc \n",
+ "19998 bob rooni yup \n",
+ "19999 think veri well time weekend \n",
+ "\n",
+ "[9992 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 116,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "neg_tweets = sampled_tweets[sampled_tweets['target']==0]\n",
+ "neg_tweets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "id": "8f37ca5f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ " clean_blob | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " 1561904 | \n",
+ " 4 | \n",
+ " 2186697271 | \n",
+ " Mon Jun 15 19:12:26 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " jmillz1214 | \n",
+ " @ANGELEYESBABYFA hey sexy how u doin | \n",
+ " [angeleyesbabyfa, hey, sexi, u, doin] | \n",
+ " angeleyesbabyfa hey sexi u doin | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1486611 | \n",
+ " 4 | \n",
+ " 2068115601 | \n",
+ " Sun Jun 07 13:31:30 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " LaChiinalinda | \n",
+ " @Yardydp81 Thankyou | \n",
+ " [yardydp, thankyou] | \n",
+ " yardydp thankyou | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 869136 | \n",
+ " 4 | \n",
+ " 1678071815 | \n",
+ " Sat May 02 05:22:25 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " HayliieeXo | \n",
+ " Listening to TMF music && browsing on ... | \n",
+ " [listen, tmf, music, amp, amp, brow, internet] | \n",
+ " listen tmf music amp amp brow internet | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1587788 | \n",
+ " 4 | \n",
+ " 2190992146 | \n",
+ " Tue Jun 16 04:15:33 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " RoryBecker | \n",
+ " @Ben_Hall Nothing I can talk about | \n",
+ " [ben, hall, noth, talk] | \n",
+ " ben hall noth talk | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 1149394 | \n",
+ " 4 | \n",
+ " 1978335050 | \n",
+ " Sat May 30 23:03:24 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " enamoredsoul | \n",
+ " @Owais_Iqbal like what songs!?!? please do sha... | \n",
+ " [owai, iqbal, like, song, plea, share] | \n",
+ " owai iqbal like song plea share | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 19992 | \n",
+ " 802696 | \n",
+ " 4 | \n",
+ " 1468268777 | \n",
+ " Tue Apr 07 00:39:04 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " answersonly | \n",
+ " Meatloaf- I love it! | \n",
+ " [meatloaf, love] | \n",
+ " meatloaf love | \n",
+ "
\n",
+ " \n",
+ " | 19993 | \n",
+ " 1346089 | \n",
+ " 4 | \n",
+ " 2044194612 | \n",
+ " Fri Jun 05 09:00:25 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Jennifuuurrr | \n",
+ " at home | \n",
+ " [home] | \n",
+ " home | \n",
+ "
\n",
+ " \n",
+ " | 19994 | \n",
+ " 1040824 | \n",
+ " 4 | \n",
+ " 1957002408 | \n",
+ " Thu May 28 23:14:55 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " jjbaby85 | \n",
+ " BrB...about to eat, drink and dance | \n",
+ " [brb, eat, drink, danc] | \n",
+ " brb eat drink danc | \n",
+ "
\n",
+ " \n",
+ " | 19995 | \n",
+ " 1008316 | \n",
+ " 4 | \n",
+ " 1880790228 | \n",
+ " Fri May 22 02:30:52 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " bletheringfool | \n",
+ " @Gailporter good news all round | \n",
+ " [gailport, good, news, round] | \n",
+ " gailport good news round | \n",
+ "
\n",
+ " \n",
+ " | 19997 | \n",
+ " 1115084 | \n",
+ " 4 | \n",
+ " 1972898225 | \n",
+ " Sat May 30 10:05:37 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " autumnrobin | \n",
+ " I woke up about an hour ago. But still laying ... | \n",
+ " [woke, hour, ago, still, lay, bed, got, ta, nail] | \n",
+ " woke hour ago still lay bed got ta nail | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10008 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index target id date flag \\\n",
+ "1 1561904 4 2186697271 Mon Jun 15 19:12:26 PDT 2009 NO_QUERY \n",
+ "2 1486611 4 2068115601 Sun Jun 07 13:31:30 PDT 2009 NO_QUERY \n",
+ "4 869136 4 1678071815 Sat May 02 05:22:25 PDT 2009 NO_QUERY \n",
+ "5 1587788 4 2190992146 Tue Jun 16 04:15:33 PDT 2009 NO_QUERY \n",
+ "6 1149394 4 1978335050 Sat May 30 23:03:24 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... ... \n",
+ "19992 802696 4 1468268777 Tue Apr 07 00:39:04 PDT 2009 NO_QUERY \n",
+ "19993 1346089 4 2044194612 Fri Jun 05 09:00:25 PDT 2009 NO_QUERY \n",
+ "19994 1040824 4 1957002408 Thu May 28 23:14:55 PDT 2009 NO_QUERY \n",
+ "19995 1008316 4 1880790228 Fri May 22 02:30:52 PDT 2009 NO_QUERY \n",
+ "19997 1115084 4 1972898225 Sat May 30 10:05:37 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "1 jmillz1214 @ANGELEYESBABYFA hey sexy how u doin \n",
+ "2 LaChiinalinda @Yardydp81 Thankyou \n",
+ "4 HayliieeXo Listening to TMF music && browsing on ... \n",
+ "5 RoryBecker @Ben_Hall Nothing I can talk about \n",
+ "6 enamoredsoul @Owais_Iqbal like what songs!?!? please do sha... \n",
+ "... ... ... \n",
+ "19992 answersonly Meatloaf- I love it! \n",
+ "19993 Jennifuuurrr at home \n",
+ "19994 jjbaby85 BrB...about to eat, drink and dance \n",
+ "19995 bletheringfool @Gailporter good news all round \n",
+ "19997 autumnrobin I woke up about an hour ago. But still laying ... \n",
+ "\n",
+ " text_processed \\\n",
+ "1 [angeleyesbabyfa, hey, sexi, u, doin] \n",
+ "2 [yardydp, thankyou] \n",
+ "4 [listen, tmf, music, amp, amp, brow, internet] \n",
+ "5 [ben, hall, noth, talk] \n",
+ "6 [owai, iqbal, like, song, plea, share] \n",
+ "... ... \n",
+ "19992 [meatloaf, love] \n",
+ "19993 [home] \n",
+ "19994 [brb, eat, drink, danc] \n",
+ "19995 [gailport, good, news, round] \n",
+ "19997 [woke, hour, ago, still, lay, bed, got, ta, nail] \n",
+ "\n",
+ " clean_blob \n",
+ "1 angeleyesbabyfa hey sexi u doin \n",
+ "2 yardydp thankyou \n",
+ "4 listen tmf music amp amp brow internet \n",
+ "5 ben hall noth talk \n",
+ "6 owai iqbal like song plea share \n",
+ "... ... \n",
+ "19992 meatloaf love \n",
+ "19993 home \n",
+ "19994 brb eat drink danc \n",
+ "19995 gailport good news round \n",
+ "19997 woke hour ago still lay bed got ta nail \n",
+ "\n",
+ "[10008 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 117,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pos_tweets = sampled_tweets[sampled_tweets['target']==4]\n",
+ "pos_tweets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "b9d8640f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the 1000 most frequent terms as bag-of-words features\n",
+ "bow_vect = CountVectorizer(max_features=1000)\n",
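+ "# fit_transform learns the vocabulary and returns one count column per retained term\n",
+ "# NOTE(review): the displayed 'text_processed' values look like token lists, while CountVectorizer expects strings by default; 'clean_blob' may be the intended input - confirm\n",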
+ "X = bow_vect.fit_transform(sampled_tweets['text_processed']).toarray()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "43ca94ab",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " abl | \n",
+ " absolut | \n",
+ " account | \n",
+ " ach | \n",
+ " act | \n",
+ " actual | \n",
+ " ad | \n",
+ " add | \n",
+ " addict | \n",
+ " ador | \n",
+ " ... | \n",
+ " yeah | \n",
+ " year | \n",
+ " yep | \n",
+ " yes | \n",
+ " yesterday | \n",
+ " yet | \n",
+ " yo | \n",
+ " youtub | \n",
+ " yr | \n",
+ " yup | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 1000 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " abl absolut account ach act actual ad add addict ador ... yeah \\\n",
+ "0 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "1 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "2 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "3 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "4 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "6 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "7 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "8 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "9 0 0 0 0 0 0 0 0 0 0 ... 0 \n",
+ "\n",
+ " year yep yes yesterday yet yo youtub yr yup \n",
+ "0 0 0 0 0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 0 0 0 0 \n",
+ "5 0 0 0 0 0 0 0 0 0 \n",
+ "6 0 0 0 0 0 0 0 0 0 \n",
+ "7 0 0 0 0 0 0 0 0 0 \n",
+ "8 0 0 0 0 0 0 0 0 0 \n",
+ "9 0 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ "[10 rows x 1000 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "as_df = pd.DataFrame(X,columns=bow_vect.get_feature_names_out())\n",
+ "as_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "87456242",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Partition the tweets into two clusters with k-means on the bag-of-words matrix\n",
+ "\n",
+ "from sklearn.cluster import KMeans\n",
+ "kmeans = KMeans(n_clusters=2, random_state=0)\n",
+ "kmeans.fit(X)\n",
+ "pred = kmeans.predict(X)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "ee86fd48",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " sad and got hurt from a friend's comment | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " I want chris back :'( how did things get this ... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " @vnakic dude, i'm surprised you found me | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Hey Internet! You know that thing you can get ... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " @rawralyrawr apparently... "dick rails&qu... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text class\n",
+ "0 sad and got hurt from a friend's comment 1\n",
+ "1 I want chris back :'( how did things get this ... 1\n",
+ "2 @vnakic dude, i'm surprised you found me 1\n",
+ "3 Hey Internet! You know that thing you can get ... 1\n",
+ "4 @rawralyrawr apparently... "dick rails&qu... 1"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predict_df = pd.concat([sampled_tweets['text'],pd.DataFrame(pred,columns=['class'])],axis=1)\n",
+ "predict_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "d69dde11",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 8 | \n",
+ " hey tweets whats going on this early morn cant... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " Going back to bed after a stressful match | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " Nearly home about 2 and a half hours left to go | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " @SdVintageVixen Awwwww! if you guys wanna com... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " @DrLisaTurner I was going to 'attend' but find... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 19913 | \n",
+ " 3rd was soooooo NOT the way to go...ughhhh fee... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 19932 | \n",
+ " Dinner first, going home soon, then to Island ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 19933 | \n",
+ " Titus was ahhh-some... now I am going to bed | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 19963 | \n",
+ " is playing green day as loud as it will go | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 19981 | \n",
+ " Feeling down today, miss my boo and kinda wann... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1657 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text class\n",
+ "8 hey tweets whats going on this early morn cant... 0\n",
+ "17 Going back to bed after a stressful match 0\n",
+ "33 Nearly home about 2 and a half hours left to go 0\n",
+ "37 @SdVintageVixen Awwwww! if you guys wanna com... 0\n",
+ "55 @DrLisaTurner I was going to 'attend' but find... 0\n",
+ "... ... ...\n",
+ "19913 3rd was soooooo NOT the way to go...ughhhh fee... 0\n",
+ "19932 Dinner first, going home soon, then to Island ... 0\n",
+ "19933 Titus was ahhh-some... now I am going to bed 0\n",
+ "19963 is playing green day as loud as it will go 0\n",
+ "19981 Feeling down today, miss my boo and kinda wann... 0\n",
+ "\n",
+ "[1657 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Tweets assigned to cluster 0\n",
+ "\n",
+ "predict_df[predict_df['class'] == 0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4606b22f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " sad and got hurt from a friend's comment | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " I want chris back :'( how did things get this ... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " @vnakic dude, i'm surprised you found me | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Hey Internet! You know that thing you can get ... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " @rawralyrawr apparently... "dick rails&qu... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 19995 | \n",
+ " @seanjissomean dude jk LOL i'll miss ya wing m... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 19996 | \n",
+ " Hey @x17online zaaaaaaac <333 so now all th... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 19997 | \n",
+ " @faces i was looking for the book two days ago... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 19998 | \n",
+ " http://twitpic.com/3jqma - I wish they fit ma... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 19999 | \n",
+ " @Shannoncurrie bahaha! you're wonderful | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
18343 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text class\n",
+ "0 sad and got hurt from a friend's comment 1\n",
+ "1 I want chris back :'( how did things get this ... 1\n",
+ "2 @vnakic dude, i'm surprised you found me 1\n",
+ "3 Hey Internet! You know that thing you can get ... 1\n",
+ "4 @rawralyrawr apparently... "dick rails&qu... 1\n",
+ "... ... ...\n",
+ "19995 @seanjissomean dude jk LOL i'll miss ya wing m... 1\n",
+ "19996 Hey @x17online zaaaaaaac <333 so now all th... 1\n",
+ "19997 @faces i was looking for the book two days ago... 1\n",
+ "19998 http://twitpic.com/3jqma - I wish they fit ma... 1\n",
+ "19999 @Shannoncurrie bahaha! you're wonderful 1\n",
+ "\n",
+ "[18343 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Tweets assigned to cluster 1\n",
+ "\n",
+ "predict_df[predict_df['class'] == 1]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}