diff --git a/lab-nlp.ipynb b/lab-nlp.ipynb new file mode 100644 index 0000000..a398e1c --- /dev/null +++ b/lab-nlp.ipynb @@ -0,0 +1,2215 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0db989b9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import nltk # Natural Language Toolkit -- this package is quite a mess: it was poorly designed and the documentation is not great\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.corpus import stopwords\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fabf32dc", + "metadata": {}, + "outputs": [], + "source": [ + "def clean_up(text):\n", + " url_pattern = r'https?://\\S+|www\\.\\S+'\n", + " symbols_pattern = r'[^a-zA-Z\\s]'\n", + "\n", + " text_without_urls = re.sub(url_pattern, '', text)\n", + " cleaned_text = re.sub(pattern, ' ', text_without_urls)\n", + " \n", + " return cleaned_text" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ed3c2d2", + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize(text):\n", + " from nltk.tokenize import word_tokenize\n", + " nltk.download('punkt')\n", + " tokens = word_tokenize(text)\n", + " return tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0658b876", + "metadata": {}, + "outputs": [], + "source": [ + "def stem_and_lemmatize(text):\n", + " from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer\n", + " ps = SnowballStemmer(language = 'english')\n", + " stemmed = [ps.stem(word) for word in tokens]\n", + " nltk.download('wordnet') # wordnet is the most well known lemmatizer for english\n", + " from nltk.stem import WordNetLemmatizer\n", + " from nltk.corpus import wordnet\n", + " nltk.download('omw-1.4')\n", + " lemmatizer = WordNetLemmatizer()\n", + " lemmatized = 
[lemmatizer.lemmatize(word) for word in stemmed]\n", + " lemmatized\n", + " return lemmatized" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "66af8ddd", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_stopwords(text):\n", + " from nltk.corpus import stopwords\n", + " nltk.download('stopwords')\n", + " without_sw = [word for word in lemmatized if word not in stopwords.words()]\n", + " return without_sw " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8532b34f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "[nltk_data] Downloading package omw-1.4 to\n", + "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package omw-1.4 is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\pedro\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.corpus import wordnet\n", + "from nltk.corpus import stopwords\n", + "nltk.download('punkt')\n", + "nltk.download('wordnet') # wordnet is the most well known lemmatizer for english\n", + "nltk.download('omw-1.4')\n", + "nltk.download('stopwords')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0fc35c50", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Function for cleaning, tokenizing, stemming, lemmatizing and removing stopwords\n", + "\n", + "import re\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import SnowballStemmer, WordNetLemmatizer\n", + "\n", + "# Precompile regular expressions to gain efficiency\n", + "url_pattern = re.compile(r'https?://\\S+|www\\.\\S+')\n", + "symbols_pattern = re.compile(r'[^a-zA-Z\\s]')\n", + "\n", + "# Define stopwords as a set\n", + "stop_words = set(stopwords.words('english'))\n", + "\n", + "def tokenize_stem_lemmatize_stopwords(text):\n", + " # Remove URLs\n", + " text_without_urls = url_pattern.sub('', text)\n", + " \n", + " # Remove symbols\n", + " cleaned_text = symbols_pattern.sub(' ', text_without_urls)\n", + " \n", + " # Tokenize\n", + " tokens = word_tokenize(cleaned_text.lower()) # Tokenize and convert to lowercase\n", + " \n", + " # Stemming\n", + " ps = SnowballStemmer(language='english')\n", + " stemmed = [ps.stem(word) for word in tokens]\n", + " \n", + " # Lemmatization\n", + " lemmatizer = WordNetLemmatizer()\n", + " lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]\n", + " \n", + " # Remove stopwords\n", + " without_sw = [word for word in lemmatized if word not in stop_words]\n", + " \n", + " return without_sw" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7692dfe5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
\n", + "
" + ], + "text/plain": [ + " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n", + "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", + "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", + "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n", + "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n", + "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n", + "\n", + " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n", + "0 is upset that he can't update his Facebook by ... \n", + "1 @Kenichan I dived many times for the ball. Man... \n", + "2 my whole body feels itchy and like its on fire \n", + "3 @nationwideclass no, it's not behaving at all.... \n", + "4 @Kwesidei not the whole crew " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#importing dataset\n", + "\n", + "tweets = pd.read_csv('tweets.csv', encoding='ISO-8859-1')\n", + "tweets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4951651e", + "metadata": {}, + "outputs": [], + "source": [ + "tweets.columns = ['target','id','date','flag','user','text']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "24054805", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertext
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
.....................
159999442193601966Tue Jun 16 08:40:49 PDT 2009NO_QUERYAmandaMarie1028Just woke up. Having no school is the best fee...
159999542193601969Tue Jun 16 08:40:49 PDT 2009NO_QUERYTheWDBoardsTheWDB.com - Very cool to hear old Walt interv...
159999642193601991Tue Jun 16 08:40:49 PDT 2009NO_QUERYbpbabeAre you ready for your MoJo Makeover? Ask me f...
159999742193602064Tue Jun 16 08:40:49 PDT 2009NO_QUERYtinydiamondzHappy 38th Birthday to my boo of alll time!!! ...
159999842193602129Tue Jun 16 08:40:50 PDT 2009NO_QUERYRyanTrevMorrishappy #charitytuesday @theNSPCC @SparksCharity...
\n", + "

1599999 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", + "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", + "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n", + "... ... ... ... ... \n", + "1599994 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", + "1599995 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", + "1599996 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", + "1599997 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n", + "1599998 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY \n", + "\n", + " user text \n", + "0 scotthamilton is upset that he can't update his Facebook by ... \n", + "1 mattycus @Kenichan I dived many times for the ball. Man... \n", + "2 ElleCTF my whole body feels itchy and like its on fire \n", + "3 Karoli @nationwideclass no, it's not behaving at all.... \n", + "4 joy_wolf @Kwesidei not the whole crew \n", + "... ... ... \n", + "1599994 AmandaMarie1028 Just woke up. Having no school is the best fee... \n", + "1599995 TheWDBoards TheWDB.com - Very cool to hear old Walt interv... \n", + "1599996 bpbabe Are you ready for your MoJo Makeover? Ask me f... \n", + "1599997 tinydiamondz Happy 38th Birthday to my boo of alll time!!! ... \n", + "1599998 RyanTrevMorris happy #charitytuesday @theNSPCC @SparksCharity... \n", + "\n", + "[1599999 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e9ce3971", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextargetiddateflagusertext
078892302325257801Thu Jun 25 05:05:48 PDT 2009NO_QUERYmakahiyasad and got hurt from a friend's comment
19129001759452127Sun May 10 19:14:36 PDT 2009NO_QUERYjustkariiI want chris back :'( how did things get this ...
284128041562361577Sun Apr 19 19:48:09 PDT 2009NO_QUERYblueyesblue@vnakic dude, i'm surprised you found me
3153049642177820795Mon Jun 15 06:45:24 PDT 2009NO_QUERYmonnieHey Internet! You know that thing you can get ...
477159202302441207Tue Jun 23 16:48:46 PDT 2009NO_QUERYbackpocketblues@rawralyrawr apparently... "dick rails&qu...
........................
1999533116502012563615Tue Jun 02 20:52:47 PDT 2009NO_QUERYMizMari@seanjissomean dude jk LOL i'll miss ya wing m...
19996120212741985803236Sun May 31 17:48:12 PDT 2009NO_QUERYLexiSunshineHey @x17online zaaaaaaac <333 so now all th...
19997106376941964544524Fri May 29 14:24:23 PDT 2009NO_QUERYaeonbeat@faces i was looking for the book two days ago...
199981345601553264085Sat Apr 18 13:48:15 PDT 2009NO_QUERYangxhttp://twitpic.com/3jqma - I wish they fit ma...
19999142304442058622571Sat Jun 06 15:18:17 PDT 2009NO_QUERYsarahbrowntown@Shannoncurrie bahaha! you're wonderful
\n", + "

20000 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " index target id date flag \\\n", + "0 788923 0 2325257801 Thu Jun 25 05:05:48 PDT 2009 NO_QUERY \n", + "1 91290 0 1759452127 Sun May 10 19:14:36 PDT 2009 NO_QUERY \n", + "2 841280 4 1562361577 Sun Apr 19 19:48:09 PDT 2009 NO_QUERY \n", + "3 1530496 4 2177820795 Mon Jun 15 06:45:24 PDT 2009 NO_QUERY \n", + "4 771592 0 2302441207 Tue Jun 23 16:48:46 PDT 2009 NO_QUERY \n", + "... ... ... ... ... ... \n", + "19995 331165 0 2012563615 Tue Jun 02 20:52:47 PDT 2009 NO_QUERY \n", + "19996 1202127 4 1985803236 Sun May 31 17:48:12 PDT 2009 NO_QUERY \n", + "19997 1063769 4 1964544524 Fri May 29 14:24:23 PDT 2009 NO_QUERY \n", + "19998 13456 0 1553264085 Sat Apr 18 13:48:15 PDT 2009 NO_QUERY \n", + "19999 1423044 4 2058622571 Sat Jun 06 15:18:17 PDT 2009 NO_QUERY \n", + "\n", + " user text \n", + "0 makahiya sad and got hurt from a friend's comment \n", + "1 justkarii I want chris back :'( how did things get this ... \n", + "2 blueyesblue @vnakic dude, i'm surprised you found me \n", + "3 monnie Hey Internet! You know that thing you can get ... \n", + "4 backpocketblues @rawralyrawr apparently... "dick rails&qu... \n", + "... ... ... \n", + "19995 MizMari @seanjissomean dude jk LOL i'll miss ya wing m... \n", + "19996 LexiSunshine Hey @x17online zaaaaaaac <333 so now all th... \n", + "19997 aeonbeat @faces i was looking for the book two days ago... \n", + "19998 angx http://twitpic.com/3jqma - I wish they fit ma... \n", + "19999 sarahbrowntown @Shannoncurrie bahaha! you're wonderful \n", + "\n", + "[20000 rows x 7 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#creating a sample of 20000\n", + "\n", + "sampled_tweets = tweets.sample(n=20000)\n", + "sampled_tweets = sampled_tweets.reset_index()\n", + "sampled_tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c0b00eca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextargetiddateflagusertexttext_processed
078892302325257801Thu Jun 25 05:05:48 PDT 2009NO_QUERYmakahiyasad and got hurt from a friend's comment[sad, got, hurt, friend, comment]
19129001759452127Sun May 10 19:14:36 PDT 2009NO_QUERYjustkariiI want chris back :'( how did things get this ...[want, chris, back, thing, get, fuck, miss, fr...
284128041562361577Sun Apr 19 19:48:09 PDT 2009NO_QUERYblueyesblue@vnakic dude, i'm surprised you found me[vnakic, dude, surpris, found]
3153049642177820795Mon Jun 15 06:45:24 PDT 2009NO_QUERYmonnieHey Internet! You know that thing you can get ...[hey, internet, know, thing, get, put, memori,...
477159202302441207Tue Jun 23 16:48:46 PDT 2009NO_QUERYbackpocketblues@rawralyrawr apparently... &quot;dick rails&qu...[rawralyrawr, appar, quot, dick, rail, quot, g...
...........................
1999533116502012563615Tue Jun 02 20:52:47 PDT 2009NO_QUERYMizMari@seanjissomean dude jk LOL i'll miss ya wing m...[seanjissomean, dude, jk, lol, miss, ya, wing,...
19996120212741985803236Sun May 31 17:48:12 PDT 2009NO_QUERYLexiSunshineHey @x17online zaaaaaaac &lt;333 so now all th...[hey, x, onlin, zaaaaaaac, lt, import, peopl, ...
19997106376941964544524Fri May 29 14:24:23 PDT 2009NO_QUERYaeonbeat@faces i was looking for the book two days ago...[face, wa, look, book, two, day, ago, burga, f...
199981345601553264085Sat Apr 18 13:48:15 PDT 2009NO_QUERYangxhttp://twitpic.com/3jqma - I wish they fit ma...[wish, fit, mayb, stuff, lot]
19999142304442058622571Sat Jun 06 15:18:17 PDT 2009NO_QUERYsarahbrowntown@Shannoncurrie bahaha! you're wonderful[shannoncurri, bahaha, wonder]
\n", + "

20000 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " index target id date flag \\\n", + "0 788923 0 2325257801 Thu Jun 25 05:05:48 PDT 2009 NO_QUERY \n", + "1 91290 0 1759452127 Sun May 10 19:14:36 PDT 2009 NO_QUERY \n", + "2 841280 4 1562361577 Sun Apr 19 19:48:09 PDT 2009 NO_QUERY \n", + "3 1530496 4 2177820795 Mon Jun 15 06:45:24 PDT 2009 NO_QUERY \n", + "4 771592 0 2302441207 Tue Jun 23 16:48:46 PDT 2009 NO_QUERY \n", + "... ... ... ... ... ... \n", + "19995 331165 0 2012563615 Tue Jun 02 20:52:47 PDT 2009 NO_QUERY \n", + "19996 1202127 4 1985803236 Sun May 31 17:48:12 PDT 2009 NO_QUERY \n", + "19997 1063769 4 1964544524 Fri May 29 14:24:23 PDT 2009 NO_QUERY \n", + "19998 13456 0 1553264085 Sat Apr 18 13:48:15 PDT 2009 NO_QUERY \n", + "19999 1423044 4 2058622571 Sat Jun 06 15:18:17 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "0 makahiya sad and got hurt from a friend's comment \n", + "1 justkarii I want chris back :'( how did things get this ... \n", + "2 blueyesblue @vnakic dude, i'm surprised you found me \n", + "3 monnie Hey Internet! You know that thing you can get ... \n", + "4 backpocketblues @rawralyrawr apparently... "dick rails&qu... \n", + "... ... ... \n", + "19995 MizMari @seanjissomean dude jk LOL i'll miss ya wing m... \n", + "19996 LexiSunshine Hey @x17online zaaaaaaac <333 so now all th... \n", + "19997 aeonbeat @faces i was looking for the book two days ago... \n", + "19998 angx http://twitpic.com/3jqma - I wish they fit ma... \n", + "19999 sarahbrowntown @Shannoncurrie bahaha! you're wonderful \n", + "\n", + " text_processed \n", + "0 [sad, got, hurt, friend, comment] \n", + "1 [want, chris, back, thing, get, fuck, miss, fr... \n", + "2 [vnakic, dude, surpris, found] \n", + "3 [hey, internet, know, thing, get, put, memori,... \n", + "4 [rawralyrawr, appar, quot, dick, rail, quot, g... \n", + "... ... \n", + "19995 [seanjissomean, dude, jk, lol, miss, ya, wing,... \n", + "19996 [hey, x, onlin, zaaaaaaac, lt, import, peopl, ... 
\n", + "19997 [face, wa, look, book, two, day, ago, burga, f... \n", + "19998 [wish, fit, mayb, stuff, lot] \n", + "19999 [shannoncurri, bahaha, wonder] \n", + "\n", + "[20000 rows x 8 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#applying previous function to the sample\n", + "\n", + "sampled_tweets['text_processed'] = sampled_tweets['text'].apply(tokenize_stem_lemmatize_stopwords)\n", + "sampled_tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9f9caf6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextargetiddateflagusertexttext_processed
078892302325257801Thu Jun 25 05:05:48 PDT 2009NO_QUERYmakahiyasad and got hurt from a friend's commentsad got hurt friend comment
19129001759452127Sun May 10 19:14:36 PDT 2009NO_QUERYjustkariiI want chris back :'( how did things get this ...want chris back thing get fuck miss friend
284128041562361577Sun Apr 19 19:48:09 PDT 2009NO_QUERYblueyesblue@vnakic dude, i'm surprised you found mevnakic dude surpris found
3153049642177820795Mon Jun 15 06:45:24 PDT 2009NO_QUERYmonnieHey Internet! You know that thing you can get ...hey internet know thing get put memori card pu...
477159202302441207Tue Jun 23 16:48:46 PDT 2009NO_QUERYbackpocketblues@rawralyrawr apparently... &quot;dick rails&qu...rawralyrawr appar quot dick rail quot girl two...
\n", + "
" + ], + "text/plain": [ + " index target id date flag \\\n", + "0 788923 0 2325257801 Thu Jun 25 05:05:48 PDT 2009 NO_QUERY \n", + "1 91290 0 1759452127 Sun May 10 19:14:36 PDT 2009 NO_QUERY \n", + "2 841280 4 1562361577 Sun Apr 19 19:48:09 PDT 2009 NO_QUERY \n", + "3 1530496 4 2177820795 Mon Jun 15 06:45:24 PDT 2009 NO_QUERY \n", + "4 771592 0 2302441207 Tue Jun 23 16:48:46 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "0 makahiya sad and got hurt from a friend's comment \n", + "1 justkarii I want chris back :'( how did things get this ... \n", + "2 blueyesblue @vnakic dude, i'm surprised you found me \n", + "3 monnie Hey Internet! You know that thing you can get ... \n", + "4 backpocketblues @rawralyrawr apparently... "dick rails&qu... \n", + "\n", + " text_processed \n", + "0 sad got hurt friend comment \n", + "1 want chris back thing get fuck miss friend \n", + "2 vnakic dude surpris found \n", + "3 hey internet know thing get put memori card pu... \n", + "4 rawralyrawr appar quot dick rail quot girl two... " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def re_blob(row):\n", + " return \" \".join(row['text_processed'])\n", + "\n", + "sampled_tweets['text_processed'] = sampled_tweets.apply(re_blob,axis=1)\n", + "sampled_tweets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "06832ee5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextargetiddateflagusertexttext_processedclean_blob
044205002067173325Sun Jun 07 11:52:50 PDT 2009NO_QUERYrike90woooohoo!? what the hell... it's cold[woooohoo, hell, cold]woooohoo hell cold
353575602198105892Tue Jun 16 15:36:42 PDT 2009NO_QUERYRobbie_Taylorwhere those *&amp; ^% Iphone 3.0 update remain...[amp, iphon, updat, remain, allreadi, midnight]amp iphon updat remain allreadi midnight
763112102232222174Thu Jun 18 19:37:09 PDT 2009NO_QUERYGinaXPi just experienced the 1st time of falling asl...[experienc, st, time, fall, asleep, wake, scre...experienc st time fall asleep wake scream beca...
919176101969708462Sat May 30 00:56:40 PDT 2009NO_QUERYAmy_E_WGetting glasses today[get, glass, today]get glass today
1268196302249873888Fri Jun 19 23:11:58 PDT 2009NO_QUERYRacheLyn5485@boysforpele32 Awww....that's sad[boysforpel, sad]boysforpel sad
..............................
1999034749802016485849Wed Jun 03 06:51:39 PDT 2009NO_QUERYnaomijlea@Rufus_Jay oww I thought it might be your gran...[rufus, jay, oww, thought, might, grandad, som...rufus jay oww thought might grandad someth oh ...
1999154705502201959167Tue Jun 16 21:01:39 PDT 2009NO_QUERYeponaproblemosIs super grossed out by the huge pimple![super, gross, huge, pimpl]super gross huge pimpl
199963219101564195769Mon Apr 20 02:15:53 PDT 2009NO_QUERYroban09Going to bed... did not finish MC[go, bed, finish, mc]go bed finish mc
1999877144702302366040Tue Jun 23 16:42:25 PDT 2009NO_QUERYjlbbb143@Bob_Roony yup[bob, rooni, yup]bob rooni yup
1999916538801960810082Fri May 29 08:32:21 PDT 2009NO_QUERYcbond007I don't think I'm very well. Just in time for ...[think, veri, well, time, weekend]think veri well time weekend
\n", + "

9992 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " index target id date flag \\\n", + "0 442050 0 2067173325 Sun Jun 07 11:52:50 PDT 2009 NO_QUERY \n", + "3 535756 0 2198105892 Tue Jun 16 15:36:42 PDT 2009 NO_QUERY \n", + "7 631121 0 2232222174 Thu Jun 18 19:37:09 PDT 2009 NO_QUERY \n", + "9 191761 0 1969708462 Sat May 30 00:56:40 PDT 2009 NO_QUERY \n", + "12 681963 0 2249873888 Fri Jun 19 23:11:58 PDT 2009 NO_QUERY \n", + "... ... ... ... ... ... \n", + "19990 347498 0 2016485849 Wed Jun 03 06:51:39 PDT 2009 NO_QUERY \n", + "19991 547055 0 2201959167 Tue Jun 16 21:01:39 PDT 2009 NO_QUERY \n", + "19996 32191 0 1564195769 Mon Apr 20 02:15:53 PDT 2009 NO_QUERY \n", + "19998 771447 0 2302366040 Tue Jun 23 16:42:25 PDT 2009 NO_QUERY \n", + "19999 165388 0 1960810082 Fri May 29 08:32:21 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "0 rike90 woooohoo!? what the hell... it's cold \n", + "3 Robbie_Taylor where those *& ^% Iphone 3.0 update remain... \n", + "7 GinaXP i just experienced the 1st time of falling asl... \n", + "9 Amy_E_W Getting glasses today \n", + "12 RacheLyn5485 @boysforpele32 Awww....that's sad \n", + "... ... ... \n", + "19990 naomijlea @Rufus_Jay oww I thought it might be your gran... \n", + "19991 eponaproblemos Is super grossed out by the huge pimple! \n", + "19996 roban09 Going to bed... did not finish MC \n", + "19998 jlbbb143 @Bob_Roony yup \n", + "19999 cbond007 I don't think I'm very well. Just in time for ... \n", + "\n", + " text_processed \\\n", + "0 [woooohoo, hell, cold] \n", + "3 [amp, iphon, updat, remain, allreadi, midnight] \n", + "7 [experienc, st, time, fall, asleep, wake, scre... \n", + "9 [get, glass, today] \n", + "12 [boysforpel, sad] \n", + "... ... \n", + "19990 [rufus, jay, oww, thought, might, grandad, som... 
\n", + "19991 [super, gross, huge, pimpl] \n", + "19996 [go, bed, finish, mc] \n", + "19998 [bob, rooni, yup] \n", + "19999 [think, veri, well, time, weekend] \n", + "\n", + " clean_blob \n", + "0 woooohoo hell cold \n", + "3 amp iphon updat remain allreadi midnight \n", + "7 experienc st time fall asleep wake scream beca... \n", + "9 get glass today \n", + "12 boysforpel sad \n", + "... ... \n", + "19990 rufus jay oww thought might grandad someth oh ... \n", + "19991 super gross huge pimpl \n", + "19996 go bed finish mc \n", + "19998 bob rooni yup \n", + "19999 think veri well time weekend \n", + "\n", + "[9992 rows x 9 columns]" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neg_tweets = sampled_tweets[sampled_tweets['target']==0]\n", + "neg_tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "8f37ca5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indextargetiddateflagusertexttext_processedclean_blob
1156190442186697271Mon Jun 15 19:12:26 PDT 2009NO_QUERYjmillz1214@ANGELEYESBABYFA hey sexy how u doin[angeleyesbabyfa, hey, sexi, u, doin]angeleyesbabyfa hey sexi u doin
2148661142068115601Sun Jun 07 13:31:30 PDT 2009NO_QUERYLaChiinalinda@Yardydp81 Thankyou[yardydp, thankyou]yardydp thankyou
486913641678071815Sat May 02 05:22:25 PDT 2009NO_QUERYHayliieeXoListening to TMF music &amp;&amp; browsing on ...[listen, tmf, music, amp, amp, brow, internet]listen tmf music amp amp brow internet
5158778842190992146Tue Jun 16 04:15:33 PDT 2009NO_QUERYRoryBecker@Ben_Hall Nothing I can talk about[ben, hall, noth, talk]ben hall noth talk
6114939441978335050Sat May 30 23:03:24 PDT 2009NO_QUERYenamoredsoul@Owais_Iqbal like what songs!?!? please do sha...[owai, iqbal, like, song, plea, share]owai iqbal like song plea share
..............................
1999280269641468268777Tue Apr 07 00:39:04 PDT 2009NO_QUERYanswersonlyMeatloaf- I love it![meatloaf, love]meatloaf love
19993134608942044194612Fri Jun 05 09:00:25 PDT 2009NO_QUERYJennifuuurrrat home[home]home
19994104082441957002408Thu May 28 23:14:55 PDT 2009NO_QUERYjjbaby85BrB...about to eat, drink and dance[brb, eat, drink, danc]brb eat drink danc
19995100831641880790228Fri May 22 02:30:52 PDT 2009NO_QUERYbletheringfool@Gailporter good news all round[gailport, good, news, round]gailport good news round
19997111508441972898225Sat May 30 10:05:37 PDT 2009NO_QUERYautumnrobinI woke up about an hour ago. But still laying ...[woke, hour, ago, still, lay, bed, got, ta, nail]woke hour ago still lay bed got ta nail
\n", + "

10008 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " index target id date flag \\\n", + "1 1561904 4 2186697271 Mon Jun 15 19:12:26 PDT 2009 NO_QUERY \n", + "2 1486611 4 2068115601 Sun Jun 07 13:31:30 PDT 2009 NO_QUERY \n", + "4 869136 4 1678071815 Sat May 02 05:22:25 PDT 2009 NO_QUERY \n", + "5 1587788 4 2190992146 Tue Jun 16 04:15:33 PDT 2009 NO_QUERY \n", + "6 1149394 4 1978335050 Sat May 30 23:03:24 PDT 2009 NO_QUERY \n", + "... ... ... ... ... ... \n", + "19992 802696 4 1468268777 Tue Apr 07 00:39:04 PDT 2009 NO_QUERY \n", + "19993 1346089 4 2044194612 Fri Jun 05 09:00:25 PDT 2009 NO_QUERY \n", + "19994 1040824 4 1957002408 Thu May 28 23:14:55 PDT 2009 NO_QUERY \n", + "19995 1008316 4 1880790228 Fri May 22 02:30:52 PDT 2009 NO_QUERY \n", + "19997 1115084 4 1972898225 Sat May 30 10:05:37 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "1 jmillz1214 @ANGELEYESBABYFA hey sexy how u doin \n", + "2 LaChiinalinda @Yardydp81 Thankyou \n", + "4 HayliieeXo Listening to TMF music && browsing on ... \n", + "5 RoryBecker @Ben_Hall Nothing I can talk about \n", + "6 enamoredsoul @Owais_Iqbal like what songs!?!? please do sha... \n", + "... ... ... \n", + "19992 answersonly Meatloaf- I love it! \n", + "19993 Jennifuuurrr at home \n", + "19994 jjbaby85 BrB...about to eat, drink and dance \n", + "19995 bletheringfool @Gailporter good news all round \n", + "19997 autumnrobin I woke up about an hour ago. But still laying ... \n", + "\n", + " text_processed \\\n", + "1 [angeleyesbabyfa, hey, sexi, u, doin] \n", + "2 [yardydp, thankyou] \n", + "4 [listen, tmf, music, amp, amp, brow, internet] \n", + "5 [ben, hall, noth, talk] \n", + "6 [owai, iqbal, like, song, plea, share] \n", + "... ... 
\n", + "19992 [meatloaf, love] \n", + "19993 [home] \n", + "19994 [brb, eat, drink, danc] \n", + "19995 [gailport, good, news, round] \n", + "19997 [woke, hour, ago, still, lay, bed, got, ta, nail] \n", + "\n", + " clean_blob \n", + "1 angeleyesbabyfa hey sexi u doin \n", + "2 yardydp thankyou \n", + "4 listen tmf music amp amp brow internet \n", + "5 ben hall noth talk \n", + "6 owai iqbal like song plea share \n", + "... ... \n", + "19992 meatloaf love \n", + "19993 home \n", + "19994 brb eat drink danc \n", + "19995 gailport good news round \n", + "19997 woke hour ago still lay bed got ta nail \n", + "\n", + "[10008 rows x 9 columns]" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pos_tweets = sampled_tweets[sampled_tweets['target']==4]\n", + "pos_tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b9d8640f", + "metadata": {}, + "outputs": [], + "source": [ + "#let's take only the most common 1000 words\n", + "bow_vect = CountVectorizer(max_features=1000)\n", + "# fit creates one entry for each different word seen\n", + "X = bow_vect.fit_transform(sampled_tweets['text_processed']).toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "43ca94ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ablabsolutaccountachactactualadaddaddictador...yeahyearyepyesyesterdayyetyoyoutubyryup
00000000000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000
50000000000...0000000000
60000000000...0000000000
70000000000...0000000000
80000000000...0000000000
90000000000...0000000000
\n", + "

10 rows × 1000 columns

\n", + "
" + ], + "text/plain": [ + " abl absolut account ach act actual ad add addict ador ... yeah \\\n", + "0 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "1 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "2 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "3 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "4 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "5 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "6 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "7 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "8 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "9 0 0 0 0 0 0 0 0 0 0 ... 0 \n", + "\n", + " year yep yes yesterday yet yo youtub yr yup \n", + "0 0 0 0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 0 0 \n", + "5 0 0 0 0 0 0 0 0 0 \n", + "6 0 0 0 0 0 0 0 0 0 \n", + "7 0 0 0 0 0 0 0 0 0 \n", + "8 0 0 0 0 0 0 0 0 0 \n", + "9 0 0 0 0 0 0 0 0 0 \n", + "\n", + "[10 rows x 1000 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "as_df = pd.DataFrame(X,columns=bow_vect.get_feature_names_out())\n", + "as_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "87456242", + "metadata": {}, + "outputs": [], + "source": [ + "#dividing tweets into two clusters\n", + "\n", + "from sklearn.cluster import KMeans\n", + "kmeans = KMeans(n_clusters=2, random_state=0)\n", + "kmeans.fit(X)\n", + "pred = kmeans.predict(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ee86fd48", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textclass
0sad and got hurt from a friend's comment1
1I want chris back :'( how did things get this ...1
2@vnakic dude, i'm surprised you found me1
3Hey Internet! You know that thing you can get ...1
4@rawralyrawr apparently... &quot;dick rails&qu...1
\n", + "
" + ], + "text/plain": [ + " text class\n", + "0 sad and got hurt from a friend's comment 1\n", + "1 I want chris back :'( how did things get this ... 1\n", + "2 @vnakic dude, i'm surprised you found me 1\n", + "3 Hey Internet! You know that thing you can get ... 1\n", + "4 @rawralyrawr apparently... "dick rails&qu... 1" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_df = pd.concat([sampled_tweets['text'],pd.DataFrame(pred,columns=['class'])],axis=1)\n", + "predict_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "d69dde11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textclass
8hey tweets whats going on this early morn cant...0
17Going back to bed after a stressful match0
33Nearly home about 2 and a half hours left to go0
37@SdVintageVixen Awwwww! if you guys wanna com...0
55@DrLisaTurner I was going to 'attend' but find...0
.........
199133rd was soooooo NOT the way to go...ughhhh fee...0
19932Dinner first, going home soon, then to Island ...0
19933Titus was ahhh-some... now I am going to bed0
19963is playing green day as loud as it will go0
19981Feeling down today, miss my boo and kinda wann...0
\n", + "

1657 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " text class\n", + "8 hey tweets whats going on this early morn cant... 0\n", + "17 Going back to bed after a stressful match 0\n", + "33 Nearly home about 2 and a half hours left to go 0\n", + "37 @SdVintageVixen Awwwww! if you guys wanna com... 0\n", + "55 @DrLisaTurner I was going to 'attend' but find... 0\n", + "... ... ...\n", + "19913 3rd was soooooo NOT the way to go...ughhhh fee... 0\n", + "19932 Dinner first, going home soon, then to Island ... 0\n", + "19933 Titus was ahhh-some... now I am going to bed 0\n", + "19963 is playing green day as loud as it will go 0\n", + "19981 Feeling down today, miss my boo and kinda wann... 0\n", + "\n", + "[1657 rows x 2 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#cluster 0\n", + "\n", + "predict_df[predict_df['class'] == 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "4606b22f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textclass
0sad and got hurt from a friend's comment1
1I want chris back :'( how did things get this ...1
2@vnakic dude, i'm surprised you found me1
3Hey Internet! You know that thing you can get ...1
4@rawralyrawr apparently... &quot;dick rails&qu...1
.........
19995@seanjissomean dude jk LOL i'll miss ya wing m...1
19996Hey @x17online zaaaaaaac &lt;333 so now all th...1
19997@faces i was looking for the book two days ago...1
19998http://twitpic.com/3jqma - I wish they fit ma...1
19999@Shannoncurrie bahaha! you're wonderful1
\n", + "

18343 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " text class\n", + "0 sad and got hurt from a friend's comment 1\n", + "1 I want chris back :'( how did things get this ... 1\n", + "2 @vnakic dude, i'm surprised you found me 1\n", + "3 Hey Internet! You know that thing you can get ... 1\n", + "4 @rawralyrawr apparently... "dick rails&qu... 1\n", + "... ... ...\n", + "19995 @seanjissomean dude jk LOL i'll miss ya wing m... 1\n", + "19996 Hey @x17online zaaaaaaac <333 so now all th... 1\n", + "19997 @faces i was looking for the book two days ago... 1\n", + "19998 http://twitpic.com/3jqma - I wish they fit ma... 1\n", + "19999 @Shannoncurrie bahaha! you're wonderful 1\n", + "\n", + "[18343 rows x 2 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#cluster 1\n", + "\n", + "predict_df[predict_df['class'] == 1]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}