diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..e292086 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,9 +66,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], + "source": [ + "import re\n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ironhack s q website is'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def clean_up(s):\n", " \"\"\"\n", @@ -79,7 +100,14 @@ "\n", " Returns:\n", " A string that has been cleaned up.\n", - " \"\"\"" + " \"\"\"\n", + " string = re.sub(r'http\S+', '', s)  # strip URLs before removing other non-letters\n", + " return re.sub(r'[^A-Za-z]+', ' ', string).lower().strip()  # keep letters only, lower-case, trim\n", + " \n", + "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n", + "\n", + "test_string = clean_up(test)\n", + "test_string" ] }, { @@ -101,9 +129,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'website', 'is']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def tokenize(s):\n", " \"\"\"\n", @@ -114,7 +153,11 @@ "\n", " Returns:\n", " A list of words as the result of tokenization.\n", - " \"\"\"" + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "test_string = tokenize(test_string)\n", + "test_string" ] }, { @@ -145,11 +188,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 's', 'q', 'websit', 'is']\n" + ] + } + ], "source": [ "def stem_and_lemmatize(l):\n", + " \n", " \"\"\"\n", " Perform stemming and lemmatization on a list of words.\n", "\n", @@ -158,7 +210,17 @@ "\n", " Returns:\n", " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l2 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)  # stem first, then lemmatize the stemmed form\n", + " s = lemmatizer.lemmatize(s)\n", + " l2.append(s)\n", + " \n", + " return l2\n" ] }, { @@ -176,10 +238,19 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 'q', 'website']\n" + ] + } + ], "source": [ + "from nltk.corpus import stopwords\n", "def remove_stopwords(l):\n", " \"\"\"\n", " Remove English stopwords from a list of strings.\n", "\n", @@ -189,7 +260,12 @@ "\n", " Returns:\n", " A list of strings after stop words are removed.\n", - " \"\"\"" + " \"\"\"\n", + " stop_words = set(stopwords.words('english'))  # set for fast membership tests\n", + "\n", + " return [w for w in l if w not in stop_words]  # a list, as the docstring specifies\n", + "\n", + "print(remove_stopwords(test_string))" ] }, { @@ -204,7 +280,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -218,7 +294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..6c11cb8 100644 --- a/your-code/challenge-2.ipynb +++
b/your-code/challenge-2.ipynb @@ -37,20 +37,109 @@ "\n", "### Loading and Exploring Data\n", "\n", - "The dataset we'll be using today is located on Kaggle (https://www.kaggle.com/kazanova/sentiment140). Once you have downloaded and imported the dataset, it you will need to define the columns names: df.columns = ['target','id','date','flag','user','text']\n", + "The dataset we'll be using today is located in the lab directory and is named `Sentiment140.csv.zip`. Unzip it into a `.csv` file, then load and explore the data in the cell below.\n", "\n", "*Notes:* \n", "\n", + "* The dataset was downloaded from [Kaggle](https://www.kaggle.com/kazanova/sentiment140). We made a slight change to the original data so that each column has a label.\n", + "\n", "* The dataset is huuuuge (1.6m tweets). When you develop your data analysis codes, you can sample a subset of the data (e.g. 20k records) so that you will save a lot of time when you test your codes." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "import pandas as pd\n", + "import numpy as np\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "import nltk\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from nltk.probability import ConditionalFreqDist" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_up(s):\n", + " \"\"\"\n", + " Cleans up numbers, URLs, and special characters from a string.\n", + "\n", + " Args:\n", + " s: The string to be cleaned up.\n", + "\n", + " Returns:\n", + " A string that has been cleaned up.\n", + " \"\"\"\n", + " string = re.sub(r'http\S+', '', s)  # strip URLs before removing other non-letters\n", + " return re.sub(r'[^A-Za-z]+', ' ', string).lower().strip()  # keep letters only, lower-case, trim\n", + "\n", + "def tokenize(s):\n", + " \"\"\"\n", + " Tokenize a string.\n", + "\n", + " Args:\n", + " s: String to be tokenized.\n", + "\n", + " Returns:\n", + " A list of words as the result of tokenization.\n", + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "def stem_and_lemmatize(l):\n", + " \n", + " \"\"\"\n", + " Perform stemming and lemmatization on a list of words.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after being stemmed and lemmatized.\n", + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l2 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)  # stem first, then lemmatize the stemmed form\n", + " s = lemmatizer.lemmatize(s)\n", + " l2.append(s)\n", + " \n", + " return l2\n", + "\n", + "\n", + "def remove_stopwords(l):\n", + " \"\"\"\n", + " Remove English stopwords from a list of strings.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after stop words are removed.\n", + " \"\"\"\n", + " stop_words = set(stopwords.words('english'))  # set for fast membership tests\n", + "\n", + " return [w for w in l if w not in stop_words]" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "tweets = pd.read_csv('Sentiment140.csv')  # the unzipped file from the lab directory\n", + "sample = tweets.sample(20000)  # 20k-record subset, per the note above\n", + "sample['target'] = sample['target'].replace(4, 1)  # relabel the positive class from 4 to 1" ] }, { @@ -76,11 +165,206 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertexttext_processed
47168602176677449Mon Jun 15 04:29:23 PDT 2009NO_QUERYJessicaShireeLast day of classes with my Day 1 kids[last, day, class, day, kid]
156875412188268260Mon Jun 15 21:35:26 PDT 2009NO_QUERYlcmelodyRant over. Now it's time for me to actually ge...[rant, time, actual, get]
151417012175446548Mon Jun 15 01:00:40 PDT 2009NO_QUERYtamzinaki@Tsaksonakis love that song.don't remember it ...[tsaksonaki, love, song, rememb, bit, f, amp, ...
101326911881296235Fri May 22 04:14:15 PDT 2009NO_QUERYEghie_DyI'm so happy[happi]
44361302067603654Sun Jun 07 12:38:14 PDT 2009NO_QUERYicysun23@decorus I DONT KNOW!!![decoru, dont, know]
........................
111039111972049147Sat May 30 08:27:15 PDT 2009NO_QUERYiCasandy@SoulGlowActivtr dat song is da bomb!!![soulglowactivtr, dat, song, da, bomb]
64930602237251942Fri Jun 19 05:28:19 PDT 2009NO_QUERYkakaxoI go to the hairdresser and then to friends ...[go, hairdress, friend, night, lt, ugli, weath...
89419511692177930Sun May 03 19:26:28 PDT 2009NO_QUERYewindsor@aLINEofCOCJIN Ah awesome. Good to hear from ...[alineofcocjin, ah, awesom, good, hear]
5769801685723636Sun May 03 01:45:08 PDT 2009NO_QUERYacaigirl@Avie89 sorry to hear that...[avi, sorri, hear]
131418812013877292Tue Jun 02 23:49:36 PDT 2009NO_QUERYJellie1981waiting for the train! http://yfrog.com/eha8sj[wait, train]
\n", + "

20000 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "471686 0 2176677449 Mon Jun 15 04:29:23 PDT 2009 NO_QUERY \n", + "1568754 1 2188268260 Mon Jun 15 21:35:26 PDT 2009 NO_QUERY \n", + "1514170 1 2175446548 Mon Jun 15 01:00:40 PDT 2009 NO_QUERY \n", + "1013269 1 1881296235 Fri May 22 04:14:15 PDT 2009 NO_QUERY \n", + "443613 0 2067603654 Sun Jun 07 12:38:14 PDT 2009 NO_QUERY \n", + "... ... ... ... ... \n", + "1110391 1 1972049147 Sat May 30 08:27:15 PDT 2009 NO_QUERY \n", + "649306 0 2237251942 Fri Jun 19 05:28:19 PDT 2009 NO_QUERY \n", + "894195 1 1692177930 Sun May 03 19:26:28 PDT 2009 NO_QUERY \n", + "57698 0 1685723636 Sun May 03 01:45:08 PDT 2009 NO_QUERY \n", + "1314188 1 2013877292 Tue Jun 02 23:49:36 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "471686 JessicaShiree Last day of classes with my Day 1 kids \n", + "1568754 lcmelody Rant over. Now it's time for me to actually ge... \n", + "1514170 tamzinaki @Tsaksonakis love that song.don't remember it ... \n", + "1013269 Eghie_Dy I'm so happy \n", + "443613 icysun23 @decorus I DONT KNOW!!! \n", + "... ... ... \n", + "1110391 iCasandy @SoulGlowActivtr dat song is da bomb!!! \n", + "649306 kakaxo I go to the hairdresser and then to friends ... \n", + "894195 ewindsor @aLINEofCOCJIN Ah awesome. Good to hear from ... \n", + "57698 acaigirl @Avie89 sorry to hear that... \n", + "1314188 Jellie1981 waiting for the train! http://yfrog.com/eha8sj \n", + "\n", + " text_processed \n", + "471686 [last, day, class, day, kid] \n", + "1568754 [rant, time, actual, get] \n", + "1514170 [tsaksonaki, love, song, rememb, bit, f, amp, ... \n", + "1013269 [happi] \n", + "443613 [decoru, dont, know] \n", + "... ... \n", + "1110391 [soulglowactivtr, dat, song, da, bomb] \n", + "649306 [go, hairdress, friend, night, lt, ugli, weath... 
\n", + "894195 [alineofcocjin, ah, awesom, good, hear] \n", + "57698 [avi, sorri, hear] \n", + "1314188 [wait, train] \n", + "\n", + "[20000 rows x 7 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "sample['text_processed'] = sample['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)\n", + "sample" ] }, { @@ -98,11 +382,1029 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['last',\n", + " 'day',\n", + " 'class',\n", + " 'kid',\n", + " 'rant',\n", + " 'time',\n", + " 'actual',\n", + " 'get',\n", + " 'tsaksonaki',\n", + " 'love',\n", + " 'song',\n", + " 'rememb',\n", + " 'bit',\n", + " 'f',\n", + " 'amp',\n", + " 'l',\n", + " 'saw',\n", + " 'actor',\n", + " 'studio',\n", + " 'happi',\n", + " 'decoru',\n", + " 'dont',\n", + " 'know',\n", + " 'rebeccamezzino',\n", + " 'hello',\n", + " 'bec',\n", + " 'long',\n", + " 'melbourn',\n", + " 'abl',\n", + " 'make',\n", + " 'tweetup',\n", + " 'ugh',\n", + " 'im',\n", + " 'tire',\n", + " 'hardli',\n", + " 'anyth',\n", + " 'play',\n", + " 'niec',\n", + " 'goin',\n", + " 'bed',\n", + " 'goodnight',\n", + " 'eveyon',\n", + " 'lt',\n", + " 'ha',\n", + " 'reject',\n", + " 'back',\n", + " 'amaz',\n", + " 'two',\n", + " 'never',\n", + " 'forget',\n", + " 'feel',\n", + " 'break',\n", + " 'pool',\n", + " 'swim',\n", + " 'middl',\n", + " 'lightn',\n", + " 'storm',\n", + " 'sunni',\n", + " 'xtineismyhero',\n", + " 'happen',\n", + " 'darl',\n", + " 'laffit',\n", + " 'still',\n", + " 'salon',\n", + " 'send',\n", + " 'pic',\n", + " 'eaten',\n", + " 'aliv',\n", + " 'mosquito',\n", + " 'post',\n", + " 'waaaaay',\n", + " 'much',\n", + " 'forgot',\n", + " 'wa',\n", + " 'gon',\n", + " 'na',\n", + " 'say',\n", + " 'mom',\n", + " 'invit',\n", + " 'parti',\n", + " 'go',\n", + " 'though',\n", + " 'veilin',\n", + " 'miss',\n", + " 'see',\n", + " 'like',\n", + " 'month',\n", + " 'alreadi',\n", + " 'think',\n", + " 'look',\n", + " 'xd',\n", + " 'chemistri',\n", + " 'revison',\n", + " 'well',\n", + " 'bore',\n", + " 'hate',\n", + " 'peopl',\n", + " 'finish',\n", + " 'exam',\n", + " 'festivalfan',\n", + " 'u',\n", + " 'start',\n", + " 'eat',\n", + " 'carb',\n", + " 'ur',\n", + " 'bodi',\n", + " 'wont',\n", + " 'weight',\n", + " 'come',\n", + " 'fast',\n", + " 'ive',\n", + " 'clean',\n", + " 'deserv',\n", + " 'recognit',\n", + " 'mother',\n", + " 'caus',\n", + " 'problem',\n", + " 'instead',\n", + " 'montyrul',\n", + " 'pearllow',\n", + " 'andi',\n", + " 'wish',\n", + " 'luck',\n", + " 'gig',\n", + " 'glad',\n", + " 'went',\n", + " 'glasto',\n", + " 'xxxxxx',\n", + " 'got',\n", + " 'ta',\n", + " 'pack',\n", + " 'trip',\n", + " 'daddi',\n", + " 'granni',\n", + " 'poo',\n", + " 'wait',\n", + " 'midnight',\n", + " 'jona',\n", + " 'brother',\n", + " 'new',\n", + " 'album',\n", + " 'tomorrow',\n", + " 'dead',\n", + " 'zoran',\n", + " 'lost',\n", + " 'croatian',\n", + " 'idol',\n", + " 'differ',\n", + " 'le',\n", + " 'vote',\n", + " 'prepar',\n", + " 'univers',\n", + " 'great',\n", + " 'hook',\n", + " 'learn',\n", + " 'read',\n", + " 'yesterday',\n", + " 'hope',\n", + " 'sloan',\n", + " 'download',\n", + " 'trvsdjam',\n", + " 'mixtap',\n", + " 'unzip',\n", + " 'invalid',\n", + " 'corrupt',\n", + " 'krishnakum',\n", + " 'told',\n", + " 'ya',\n", + " 'uber',\n", + " 'cool',\n", + " 'treat',\n", + " 'omen',\n", + " 'yoo',\n", + " 'sittin',\n", + " 
'car',\n", + " 'crazi',\n", + " 'ish',\n", + " 'rite',\n", + " 'man',\n", + " 'ima',\n", + " 'nitemar',\n", + " 'tonit',\n", + " 'bryci',\n", + " 'seen',\n", + " 'ure',\n", + " 'stpatrick',\n", + " 'show',\n", + " 'amazin',\n", + " 'person',\n", + " 'take',\n", + " 'earlier',\n", + " 'comment',\n", + " 'b',\n", + " 'superstar',\n", + " 'synwpn',\n", + " 'whi',\n", + " 'wan',\n", + " 'stalk',\n", + " 'anyway',\n", + " 'chelseamoss',\n", + " 'everyth',\n", + " 'summer',\n", + " 'far',\n", + " 'good',\n", + " 'ashleigharsen',\n", + " 'also',\n", + " 'lemonhead',\n", + " 'delici',\n", + " 'plea',\n", + " 'compar',\n", + " 'one',\n", + " 'latinegro',\n", + " 'unfortun',\n", + " 'red',\n", + " 'robin',\n", + " 'famili',\n", + " 'friend',\n", + " 'perfect',\n", + " 'church',\n", + " 'lunch',\n", + " 'lora',\n", + " 'text',\n", + " 'around',\n", + " 'watch',\n", + " 'oc',\n", + " 'ye',\n", + " 'knockin',\n", + " 'night',\n", + " 'pointforwardpro',\n", + " 'omg',\n", + " 'thnk',\n", + " 'remind',\n", + " 'colleg',\n", + " 'right',\n", + " 'everytim',\n", + " 'spend',\n", + " 'front',\n", + " 'comput',\n", + " 'hurt',\n", + " 'home',\n", + " 'put',\n", + " 'bedtim',\n", + " 'becaus',\n", + " 'want',\n", + " 'sleep',\n", + " 'alon',\n", + " 'feliz',\n", + " 'de',\n", + " 'la',\n", + " 'madr',\n", + " 'godmommi',\n", + " 'sign',\n", + " 'contract',\n", + " 'apart',\n", + " 'need',\n", + " 'email',\n", + " 'contact',\n", + " 'check',\n", + " 'emailunlimit',\n", + " 'final',\n", + " 'broke',\n", + " 'hi',\n", + " 'casino',\n", + " 'habit',\n", + " 'lose',\n", + " 'next',\n", + " 'week',\n", + " 'afraid',\n", + " 'nightmar',\n", + " 'chees',\n", + " 'befor',\n", + " 'effect',\n", + " 'ravioli',\n", + " 'grill',\n", + " 'twitter',\n", + " 'product',\n", + " 'velvet',\n", + " 'cake',\n", + " 'speak',\n", + " 'sofiedevil',\n", + " 'left',\n", + " 'phone',\n", + " 'lie',\n", + " 'cough',\n", + " 'wonder',\n", + " 'anyon',\n", + " 'die',\n", + " 'seedi',\n", + " 'motel',\n", + " 'zombi',\n", + " 'shark',\n", + " 'tuesday',\n", + " 'woke',\n", + " 'pm',\n", + " 'fall',\n", + " 'asleep',\n", + " 'suck',\n", + " 'hour',\n", + " 'ie',\n", + " 'greek',\n", + " 'easter',\n", + " 'asian',\n", + " 'twist',\n", + " 'ashalale',\n", + " 'soo',\n", + " 'creativ',\n", + " 'gt',\n", + " 'thi',\n", + " 'must',\n", + " 'find',\n", + " 'someth',\n", + " 'fun',\n", + " 'blkpanther',\n", + " 'way',\n", + " 'seem',\n", + " 'somali',\n", + " 'civil',\n", + " 'war',\n", + " 'roto',\n", + " 'sadli',\n", + " 'espn',\n", + " 'gener',\n", + " 'medium',\n", + " 'kind',\n", + " 'thing',\n", + " 'amournoir',\n", + " 'work',\n", + " 'casualcottag',\n", + " 'big',\n", + " 'smiley',\n", + " 'face',\n", + " 'mybigg',\n", + " 'lolz',\n", + " 'wnba',\n", + " 'wrong',\n", + " 'live',\n", + " 'access',\n", + " 'graphic',\n", + " 'blog',\n", + " 'oprah',\n", + " 'thank',\n", + " 'share',\n", + " 'alexrk',\n", + " 'haha',\n", + " 'true',\n", + " 'alex',\n", + " 'ahhh',\n", + " 'anoth',\n", + " 'soon',\n", + " 'hmmm',\n", + " 'white',\n", + " 'tri',\n", + " 'chase',\n", + " 'couch',\n", + " 'yell',\n", + " 'rm',\n", + " 'poorer',\n", + " 'accident',\n", + " 'gave',\n", + " 'discount',\n", + " 'magazin',\n", + " 'sob',\n", + " 'twit',\n", + " 'juli',\n", + " 'nope',\n", + " 'yet',\n", + " 'scienc',\n", + " 'summ',\n", + " 'today',\n", + " 'jame',\n", + " 'buckley',\n", + " 'would',\n", + " 'end',\n", + " 'till',\n", + " 'weekend',\n", + " 'reschedul',\n", + " 'shower',\n", + " 'water',\n", + " 'frozen',\n", + " 'readi',\n", + " 'alway',\n", + " 
'room',\n", + " 'veri',\n", + " 'hot',\n", + " 'sazp',\n", + " 'lush',\n", + " 'realiz',\n", + " 'tune',\n", + " 'thesixtyon',\n", + " 'com',\n", + " 'coupl',\n", + " 'nice',\n", + " 'daniboo',\n", + " 'hous',\n", + " 'woah',\n", + " 'rock',\n", + " 'life',\n", + " 'jesu',\n", + " 'takin',\n", + " 'step',\n", + " 'stuff',\n", + " 'food',\n", + " 'montanaon',\n", + " 'could',\n", + " 'flight',\n", + " 'agentpatgillen',\n", + " 'sure',\n", + " 'feelin',\n", + " 'trulli',\n", + " 'stune',\n", + " 'mr',\n", + " 'david',\n", + " 'carradin',\n", + " 'kungfu',\n", + " 'movi',\n", + " 'lushi',\n", + " 'dread',\n", + " 'deni',\n", + " 'danimarzillo',\n", + " 'ouch',\n", + " 'slice',\n", + " 'top',\n", + " 'finger',\n", + " 'gloriou',\n", + " 'sunshin',\n", + " 'bake',\n", + " 'browni',\n", + " 'sun',\n", + " 'yay',\n", + " 'boyl',\n", + " 'didnt',\n", + " 'win',\n", + " 'listen',\n", + " 'britney',\n", + " 'loveeess',\n", + " 'xoxo',\n", + " 'crush',\n", + " 'hardcor',\n", + " 'busi',\n", + " 'sore',\n", + " 'realli',\n", + " 'school',\n", + " 'ughh',\n", + " 'quot',\n", + " 'bbq',\n", + " 'outsid',\n", + " 'sweat',\n", + " 'smell',\n", + " 'yummi',\n", + " 'follwr',\n", + " 'drop',\n", + " 'hard',\n", + " 'tell',\n", + " 'spambot',\n", + " 'record',\n", + " 'stock',\n", + " 'nake',\n", + " 'ladi',\n", + " 'opportun',\n", + " 'dianhadinoto',\n", + " 'sweet',\n", + " 'sharlynnx',\n", + " 'aww',\n", + " 'naah',\n", + " 'favourit',\n", + " 'nighti',\n", + " 'cours',\n", + " 'delet',\n", + " 'dawson',\n", + " 'creek',\n", + " 'ol',\n", + " 'ruben',\n", + " 'spanish',\n", + " 'gp',\n", + " 'disappoint',\n", + " 'soft',\n", + " 'spot',\n", + " 'jcookonlin',\n", + " 'yeahhh',\n", + " 'mean',\n", + " 'someon',\n", + " 'fairli',\n", + " 'close',\n", + " 'knew',\n", + " 'sad',\n", + " 'camera',\n", + " 'nsenze',\n", + " 'onli',\n", + " 'wors',\n", + " 'suppos',\n", + " 'repli',\n", + " 'lalaitsmaria',\n", + " 'oh',\n", + " 'carliecarrcrash',\n", + " 'panaera',\n", + " 'four',\n", + " 'tonight',\n", + " 'best',\n", + " 'orlando',\n", + " 'girli',\n", + " 'backyard',\n", + " 'later',\n", + " 'afternoon',\n", + " 'real',\n", + " 'forev',\n", + " 'innoc',\n", + " 'word',\n", + " 'yr',\n", + " 'old',\n", + " 'hold',\n", + " 'even',\n", + " 'millionair',\n", + " 'driver',\n", + " 'spin',\n", + " 'crash',\n", + " 'super',\n", + " 'race',\n", + " 'machin',\n", + " 'raini',\n", + " 'shanghai',\n", + " 'simpli',\n", + " 'made',\n", + " 'fail',\n", + " 'darlingnickieb',\n", + " 'daaannnnggg',\n", + " 'porki',\n", + " 'xo',\n", + " 'handli',\n", + " 'mayb',\n", + " 'charlii',\n", + " 'yeah',\n", + " 'felt',\n", + " 'bad',\n", + " 'um',\n", + " 'ate',\n", + " 'rainbow',\n", + " 'paddl',\n", + " 'pop',\n", + " 'xx',\n", + " 'sethu',\n", + " 'j',\n", + " 'thought',\n", + " 'exactli',\n", + " 'fuck',\n", + " 'er',\n", + " 'girl',\n", + " 'enjoy',\n", + " 'wknd',\n", + " 'might',\n", + " 'landd',\n", + " 'foca',\n", + " 'nesslle',\n", + " 'x',\n", + " 'men',\n", + " 'fan',\n", + " 'paola',\n", + " 'total',\n", + " 'hugh',\n", + " 'gif',\n", + " 'cute',\n", + " 'norm',\n", + " 'cant',\n", + " 'tom',\n", + " 'isnt',\n", + " 'n',\n", + " 'fave',\n", + " 'guess',\n", + " 'hahahha',\n", + " 'pretti',\n", + " 'tea',\n", + " 'thigh',\n", + " 'interfac',\n", + " 'nrwi',\n", + " 'throw',\n", + " 'heavi',\n", + " 'object',\n", + " 'snore',\n", + " 'hezmcfli',\n", + " 'definit',\n", + " 'damn',\n", + " 'sick',\n", + " 'studi',\n", + " 'law',\n", + " 'leav',\n", + " 'stra',\n", + " 'doubl',\n", + " 'ea',\n", + " 'bradford',\n", + " 
'aw',\n", + " 'shut',\n", + " 'lol',\n", + " 'superbad',\n", + " 'ashkiiwil',\n", + " 'sportsgirlsplay',\n", + " 'coach',\n", + " 'forc',\n", + " 'retir',\n", + " 'year',\n", + " 'ago',\n", + " 'due',\n", + " 'injuri',\n", + " 'terrinixon',\n", + " 'laugh',\n", + " 'oral',\n", + " 'present',\n", + " 'monday',\n", + " 'whole',\n", + " 'johnherman',\n", + " 'congrat',\n", + " 'deliveri',\n", + " 'trust',\n", + " 'theoshu',\n", + " 'oishi',\n", + " 'cheap',\n", + " 'satisfi',\n", + " 'sushi',\n", + " 'soup',\n", + " 'excel',\n", + " 'edward',\n", + " 'dumb',\n", + " 'twin',\n", + " 'rabbitport',\n", + " 'charact',\n", + " 'tuna',\n", + " 'sandwich',\n", + " 'done',\n", + " 'mayson',\n", + " 'youu',\n", + " 'eu',\n", + " 'queria',\n", + " 'que',\n", + " 'era',\n", + " 'rewind',\n", + " 'fo',\n", + " 'pr',\n", + " 'ximo',\n", + " 'singl',\n", + " 'ou',\n", + " 'without',\n", + " 'worst',\n", + " 'ever',\n", + " 'fml',\n", + " 'fashion',\n", + " 'statement',\n", + " 'head',\n", + " 'scarf',\n", + " 'style',\n", + " 'doin',\n", + " 'mama',\n", + " 'yoyoemma',\n", + " 'ceekaigax',\n", + " 'away',\n", + " 'fulli',\n", + " 'woken',\n", + " 'winterchick',\n", + " 'nah',\n", + " 'write',\n", + " 'wrote',\n", + " 'basic',\n", + " 'idea',\n", + " 'breakfast',\n", + " 'sat',\n", + " 'zach',\n", + " 'ashleeeyyyyy',\n", + " 'tushsharma',\n", + " 'uh',\n", + " 'huh',\n", + " 'fellow',\n", + " 'unit',\n", + " 'cooki',\n", + " 'mood',\n", + " 'bos',\n", + " 'bitch',\n", + " 'endlessli',\n", + " 'cre',\n", + " 'tvdirektr',\n", + " 'wow',\n", + " 'nurseju',\n", + " 'earli',\n", + " 'taxi',\n", + " 'worth',\n", + " 'give',\n", + " 'beamer',\n", + " 'washhhh',\n", + " 'mamzellef',\n", + " 'dad',\n", + " 'tallk',\n", + " 'fix',\n", + " 'weareleet',\n", + " 'hungov',\n", + " 'greasi',\n", + " 'noth',\n", + " 'help',\n", + " 'keep',\n", + " 'run',\n", + " 'truth',\n", + " 'hw',\n", + " 'may',\n", + " 'bother',\n", + " 'meraki',\n", + " 'blanket',\n", + " 'san',\n", + " 'fran',\n", + " 'free',\n", + " 'wi',\n", + " 'fi',\n", + " 'news',\n", + " 'sheilafightseb',\n", + " 'sheila',\n", + " 'riddl',\n", + " 'horni',\n", + " 'kitti',\n", + " 'funni',\n", + " 'accur',\n", + " 'pest',\n", + " 'whiteplum',\n", + " 'inde',\n", + " 'bet',\n", + " 'dog',\n", + " 'bun',\n", + " 'sea',\n", + " 'garbag',\n", + " 'truck',\n", + " 'guy',\n", + " 'plenti',\n", + " 'flashbelt',\n", + " 'tue',\n", + " 'wen',\n", + " 'beacus',\n", + " 'ex',\n", + " 'girlfrend',\n", + " 'plussizemommi',\n", + " 'reason',\n", + " 'open',\n", + " 'found',\n", + " 'whew',\n", + " 'collabor',\n", + " 'admit',\n", + " 'aka',\n", + " 'kristin',\n", + " 'daynaroselli',\n", + " 'elev',\n", + " 'favorit',\n", + " 'danddncgirl',\n", + " 'sorri',\n", + " 'hear',\n", + " 'part',\n", + " 'phx',\n", + " 'rel',\n", + " 'dmosley',\n", + " 'goingbto',\n", + " 'ohsailor',\n", + " 'sri',\n", + " 'bb',\n", + " 'chrissyjohnson',\n", + " 'jacki',\n", + " 'fair',\n", + " 'tomm',\n", + " 'sunday',\n", + " 'rachaelxxo',\n", + " 'oooh',\n", + " 'question',\n", + " 'mark',\n", + " 'quit',\n", + " 'pick',\n", + " 'aah',\n", + " 'annoy',\n", + " 'throat',\n", + " 'seandonaho',\n", + " 'ad',\n", + " 'sergverdi',\n", + " 'lesli',\n", + " 'vfcst',\n", + " 'ouchh',\n", + " 'yea',\n", + " 'aim',\n", + " 'talk',\n", + " 'mee',\n", + " 'laptop',\n", + " 'freak',\n", + " 'piss',\n", + " 'jerk',\n", + " 'heart',\n", + " 'yank',\n", + " 'bryanlyt',\n", + " 'certain',\n", + " 'area',\n", + " 'utama',\n", + " 'kota',\n", + " 'dsara',\n", + " 'mayhemmil',\n", + " 'elli',\n", + " 'defin',\n", + 
" 'faction',\n", + " 'hell',\n", + " 'son',\n", + " 'cat',\n", + " 'unpack',\n", + " 'miklo',\n", + " 'victori',\n", + " 'park',\n", + " 'twilight',\n", + " 'lineup',\n", + " 'bummer',\n", + " 'proud',\n", + " 'kill',\n", + " 'thursday',\n", + " 'aye',\n", + " 'xcspeed',\n", + " 'south',\n", + " 'padr',\n", + " 'island',\n", + " 'tommorow',\n", + " 'schlitterban',\n", + " 'ali',\n", + " 'davi',\n", + " 'lucki',\n", + " 'student',\n", + " 'teach',\n", + " 'point',\n", + " 'ncheck',\n", + " 'servic',\n", + " 'number',\n", + " 'gb',\n", + " 'gorgeou',\n", + " 'weather',\n", + " 'bike',\n", + " 'london',\n", + " 'babi',\n", + " 'refil',\n", + " 'bttle',\n", + " 'gym',\n", + " 'fountain',\n", + " 'spilt',\n", + " 'evrywhr',\n", + " 'embarress',\n", + " 'controversi',\n", + " 'link',\n", + " 'privat',\n", + " 'wahhh',\n", + " 'restart',\n", + " 'heheh',\n", + " 'jonasbroth',\n", + " 'bf',\n", + " 'weird',\n", + " 'let',\n", + " 'updat',\n", + " 'follow',\n", + " 'deathli',\n", + " 'hangov',\n", + " 'morn',\n", + " 'puke',\n", + " 'zorb',\n", + " 'buffet',\n", + " 'swag',\n", + " 'bag',\n", + " 'panten',\n", + " 'shoppen',\n", + " 'toll',\n", + " 'un',\n", + " 'tina',\n", + " 'getroffen',\n", + " 'yez',\n", + " 'pc',\n", + " 'gammeln',\n", + " 'chakatsunstreak',\n", + " 'small',\n", + " 'math',\n", + " 'yes',\n", + " 'horribl',\n", + " 'excit',\n", + " 'microsoft',\n", + " 'confer',\n", + " 'late',\n", + " 'awesom',\n", + " 'afro',\n", + " 'ffxiii',\n", + " 'w',\n", + " 'shelley',\n", + " 'airport',\n", + " 'nephew',\n", + " 'kat',\n", + " 'dentist',\n", + " 'brace',\n", + " 'monicaa',\n", + " 'sengupta',\n", + " 'krist',\n", + " 'ph',\n", + " 'r',\n", + " 'repeat',\n", + " 'parentstud',\n", + " 'marcolaureano',\n", + " 'welcom',\n", + " 'non',\n", + " 'stop',\n", + " 'tweet',\n", + " 'johnkuan',\n", + " 'pronaz',\n", + " 'raj',\n", + " 'lebron',\n", + " 'v',\n", + " 'kobe',\n", + " 'least',\n", + " 'yanke',\n", + " 'gah',\n", + " 'mimic',\n", + " 'ubisoft',\n", + " 'littl',\n", + " 'orient',\n", + " 'eminem',\n", + " 'track',\n", + " 'formula',\n", + " 'seriou',\n", + " 'arf',\n", + " 'guinea',\n", + " 'pig',\n", + " 'dine',\n", + " 'noisi',\n", + " 'doggi',\n", + " 'ryke',\n", + " 'whether',\n", + " 'includ',\n", + " 'straight',\n", + " 'latest',\n", + " 'report',\n", + " 'surviv',\n", + " 'recess',\n", + " 'ask',\n", + " 'www',\n", + " 'bgacceler',\n", + " 'episod',\n", + " 'simpson',\n", + " 'pollinatewildli',\n", + " 'catalyt',\n", + " 'convert',\n", + " 'broken',\n", + " 'cost',\n", + " 'sold',\n", + " 'max',\n", + " 'settin',\n", + " 'websit',\n", + " 'whoa',\n", + " 'ethansuple',\n", + " 'boy',\n", + " 'freakin',\n", + " 'strong',\n", + " 'weakest',\n", + " 'rorzshach',\n", + " 'devon',\n", + " 'hahaha',\n", + " 'freed',\n", + " 'prison',\n", + " 'ah',\n", + " 'freedom',\n", + " 'princesssuperc',\n", + " 'shoot',\n", + " 'delboy',\n", + " 'promot',\n", + " 'doe',\n", + " 'lindasmith',\n", + " 'pharmaci',\n", + " 'dalla',\n", + " 'hawaii',\n", + " 'friday',\n", + " 'cowboyhazel',\n", + " 'ok',\n", + " 'tip',\n", + " 'plurk',\n", + " 'hellasia',\n", + " 'stu',\n", + " 'gg',\n", + " 'young',\n", + " 'world',\n", + " 'foot',\n", + " 'killin',\n", + " 'name',\n", + " 'better',\n", + " 'ahead',\n", + " 'ashleebiscuit',\n", + " 'fell',\n", + " 'fring',\n", + " 'hang',\n", + " 'puppi',\n", + " 'pakc',\n", + " 'texa',\n", + " 'mission',\n", + " 'jessemccartney',\n", + " 'ughhh',\n", + " 'c',\n", + " 'sooo',\n", + " 'longer',\n", + " 'term',\n", + " 'trick',\n", + " 'jennybdesign',\n", + " 
'correct',\n", + " 'goe',\n", + " 'public',\n", + " 'lottaburg',\n", + " 'jam',\n", + " 'fireflight',\n", + " ...]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here" + "fdist = nltk.FreqDist()  # plain frequency distribution over all tokens in the sample\n", + "\n", + "for tweet in sample['text_processed']:\n", + " for word in tweet:\n", + " fdist[word] += 1\n", + "\n", + "top_words = [w for w, _ in fdist.most_common(5000)]  # the 5,000 most frequent words, not just the first 5,000 inserted\n", + "top_words" ] }, { @@ -167,11 +1469,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20000\n" + ] + } + ], "source": [ - "# your code here" + "def find_features(document):\n", + " words = set(document)  # a set makes the membership checks below O(1)\n", + " features = {}\n", + " for w in top_words:\n", + " features[w] = (w in words)\n", + " \n", + " return features\n", + " \n", + "feature_sets = [(find_features(tweet), target) for (tweet, target) in zip(sample['text_processed'], sample['target'])]\n", + "print(len(feature_sets))" ] }, { @@ -210,11 +1529,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "train_set, test_set = feature_sets[:10000], feature_sets[10000:]  # 50/50 split of the 20k sample\n", + "classifier = nltk.NaiveBayesClassifier.train(train_set)" ] }, { @@ -230,11 +1550,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7162\n", + "Most Informative Features\n", + " sad = True 0 : 1 = 22.1 : 1.0\n", + " sick = True 0 : 1 = 16.5 : 1.0\n", + " headach = True 0 : 1 = 15.8 : 1.0\n", + " bum = True 0 : 1 = 13.3 : 1.0\n", + " hospit = True 0 : 1 = 12.7 : 1.0\n" + ] + } + ], "source": [ - "# your code here" + "print(nltk.classify.accuracy(classifier, test_set))\n", + "classifier.show_most_informative_features(5)" ] }, { @@ -298,7 +1633,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -312,7 +1647,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.4" } }, "nbformat": 4,
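Both notebooks rely on NLTK data packages behind nltk.word_tokenize, nltk.WordNetLemmatizer, and nltk.corpus.stopwords, but no download step appears in either diff. A minimal one-time setup sketch, using the standard NLTK resource identifiers:

import nltk

nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')    # lexical database used by nltk.WordNetLemmatizer
nltk.download('stopwords')  # English stopword list used by nltk.corpus.stopwords

On some NLTK versions the lemmatizer additionally requires nltk.download('omw-1.4').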
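For comparison only, not part of the submitted notebooks: the boolean find_features dictionaries fed to nltk.NaiveBayesClassifier implement a bag-of-words model that scikit-learn's CountVectorizer (imported in challenge-2 but never used) can express more compactly. A hedged sketch, assuming the same sample DataFrame with text_processed (token lists) and target columns; the helper name sklearn_baseline is illustrative:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

def sklearn_baseline(sample: pd.DataFrame) -> float:
    # CountVectorizer consumes raw strings, so re-join the token lists.
    docs = sample['text_processed'].apply(' '.join)
    X_train, X_test, y_train, y_test = train_test_split(
        docs, sample['target'], test_size=0.5, random_state=42)
    # binary=True mirrors the True/False presence features of find_features;
    # max_features=5000 mirrors the notebook's 5,000-word vocabulary.
    vectorizer = CountVectorizer(max_features=5000, binary=True)
    clf = MultinomialNB().fit(vectorizer.fit_transform(X_train), y_train)
    return accuracy_score(y_test, clf.predict(vectorizer.transform(X_test)))

Accuracy should land in the same ballpark as the 0.7162 reported above, though the exact figure depends on which 20k rows were sampled.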