From 4073d1ab3ae4cb405a6f19d377ac563fdfb9eb90 Mon Sep 17 00:00:00 2001 From: Francisco Barreto Date: Mon, 8 Aug 2022 15:11:06 +0100 Subject: [PATCH] lab-nlp --- your-code/challenge-1.ipynb | 138 +++- your-code/challenge-2.ipynb | 1421 ++++++++++++++++++++++++++++++++++- 2 files changed, 1532 insertions(+), 27 deletions(-) diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..e15304e 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,9 +66,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], + "source": [ + "import re \n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ironhack s q website is'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def clean_up(s):\n", " \"\"\"\n", @@ -79,7 +100,14 @@ "\n", " Returns:\n", " A string that has been cleaned up.\n", - " \"\"\"" + " \"\"\"\n", + " string= re.sub(r'http\\S+','',s)\n", + " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n", + " \n", + "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n", + "\n", + "test_string = clean_up(test)\n", + "test_string" ] }, { @@ -101,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +142,29 @@ "\n", " Returns:\n", " A list of words as the result of tokenization.\n", - " \"\"\"" + " \"\"\"\n", + " return nltk.word_tokenize(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'website', 'is']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokens = tokenize(test_string)\n", + "tokens" ] }, { @@ -145,7 +195,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.stem import PorterStemmer" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -158,7 +218,37 @@ "\n", " Returns:\n", " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l2 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)\n", + " s = lemmatizer.lemmatize(s)\n", + " l2 += [s]\n", + " \n", + " return l2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'websit', 'is']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stem_and_lemmatize(tokens)" ] }, { @@ -176,10 +266,19 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r n h c k q w e b e \n" + ] + } + ], "source": [ + "from nltk.corpus import stopwords\n", "def remove_stopwords(l):\n", " \"\"\"\n", " Remove English stopwords from a list of strings.\n", @@ -189,9 +288,21 @@ "\n", " Returns:\n", " A list of strings after stop words are removed.\n", - " \"\"\"" + " \"\"\"\n", + " stop_words = stopwords.words('english')\n", + "\n", + " return ' '.join([w for w in l if w not in stop_words])\n", + "\n", + "print(remove_stopwords(test_string))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -204,7 +315,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.12 ('base')", "language": "python", "name": "python3" }, @@ -218,7 +329,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.12" + }, + "vscode": { + "interpreter": { + "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186" + } } }, "nbformat": 4, diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..5f5bb5a 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -18,8 +18,8 @@ "\n", "```python\n", ">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", - ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
", - "
", + ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
\n", + "
\n", "Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n", ">>> analyzer = SentimentIntensityAnalyzer()\n", ">>> analyzer.polarity_scores(txt)\n", @@ -46,11 +46,344 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "import pandas as pd\n", + "import numpy as np\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "import nltk\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from nltk.probability import ConditionalFreqDist" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C:/Users/franc/Desktop/Labs Iron Hack/Last labs 16.07.2022/lab-nlp/training_dataset.csv\n" + ] + } + ], + "source": [ + "import tkinter as tk\n", + "from tkinter.filedialog import askopenfilename\n", + "import pandas as pd\n", + "\n", + "root = tk.Tk()\n", + "root.withdraw() #Prevents the Tkinter window to come up\n", + "exlpath = askopenfilename()\n", + "root.destroy()\n", + "print(exlpath)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# your code here\n", + "def clean_up(s):\n", + " \"\"\"\n", + " Cleans up numbers, URLs, and special characters from a string.\n", + "\n", + " Args:\n", + " s: The string to be cleaned up.\n", + "\n", + " Returns:\n", + " A string that has been cleaned up.\n", + " \"\"\"\n", + " string = re.sub(r'http\\S+', '', s)\n", + " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n", + "\n", + "def tokenize(s):\n", + " \"\"\"\n", + " Tokenize a string.\n", + "\n", + " Args:\n", + " s: String to be tokenized.\n", + "\n", + " Returns:\n", + " A list of words as the result of tokenization.\n", + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "def stem_and_lemmatize(l):\n", + " \n", + " \"\"\"\n", + " Perform stemming and lemmatization on a list of words.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after being stemmed and lemmatized.\n", + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l2 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)\n", + " s = lemmatizer.lemmatize(s)\n", + " l2 += [s]\n", + " \n", + " return l2\n", + "\n", + "\n", + "def remove_stopwords(l):\n", + " \"\"\"\n", + " Remove English stopwords from a list of strings.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after stop words are removed.\n", + " \"\"\"\n", + " stop_words = stopwords.words('english')\n", + "\n", + " return [w for w in l if w not in stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "tweets = pd.read_csv(exlpath, engine='python',encoding=\"ISO-8859-1\",names=['Index_0', 'DATE', 'QUERY', 'User','text'], header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Index_0DATEQUERYUsertext
01467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
01467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
01467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
01467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
01467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", + "
" + ], + "text/plain": [ + " Index_0 DATE QUERY User \\\n", + "0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \n", + "0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", + "0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", + "0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n", + "0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n", + "\n", + " text \n", + "0 @switchfoot http://twitpic.com/2y1zl - Awww, t... \n", + "0 is upset that he can't update his Facebook by ... \n", + "0 @Kenichan I dived many times for the ball. Man... \n", + "0 my whole body feels itchy and like its on fire \n", + "0 @nationwideclass no, it's not behaving at all.... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "sample = tweets.sample(20000)\n", + "sample['target'] = sample['Index_0'].replace(4, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Index_0DATEQUERYUsertexttarget
01975865841Sat May 30 16:04:18 PDT 2009NO_QUERYmartinsandovalI have so much grains... I'm scared1975865841
02298482191Tue Jun 23 11:44:01 PDT 2009NO_QUERYdrunkenscholar@LaurenWJohnston There's 1 copy and 5 requests.2298482191
01882734354Fri May 22 07:23:40 PDT 2009NO_QUERYSophieCollinssis eating maltesers, have work at 51882734354
41880764926Fri May 22 02:25:19 PDT 2009NO_QUERYmrQQI'm in love with fairytale.. and it doesnt hur...1880764926
42066608145Sun Jun 07 10:50:25 PDT 2009NO_QUERYenithhernandez.@MarcelloJun or you are just a sheeple?2066608145
\n", + "
" + ], + "text/plain": [ + " Index_0 DATE QUERY User \\\n", + "0 1975865841 Sat May 30 16:04:18 PDT 2009 NO_QUERY martinsandoval \n", + "0 2298482191 Tue Jun 23 11:44:01 PDT 2009 NO_QUERY drunkenscholar \n", + "0 1882734354 Fri May 22 07:23:40 PDT 2009 NO_QUERY SophieCollinss \n", + "4 1880764926 Fri May 22 02:25:19 PDT 2009 NO_QUERY mrQQ \n", + "4 2066608145 Sun Jun 07 10:50:25 PDT 2009 NO_QUERY enithhernandez \n", + "\n", + " text target \n", + "0 I have so much grains... I'm scared 1975865841 \n", + "0 @LaurenWJohnston There's 1 copy and 5 requests. 2298482191 \n", + "0 is eating maltesers, have work at 5 1882734354 \n", + "4 I'm in love with fairytale.. and it doesnt hur... 1880764926 \n", + "4 .@MarcelloJun or you are just a sheeple? 2066608145 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample.head()" ] }, { @@ -76,11 +409,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "sample[\"text_processed\"]= sample[\"text\"].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)" ] }, { @@ -98,11 +432,1030 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['much',\n", + " 'grain',\n", + " 'scare',\n", + " 'laurenwjohnston',\n", + " 'copi',\n", + " 'request',\n", + " 'eat',\n", + " 'maltes',\n", + " 'work',\n", + " 'love',\n", + " 'fairytal',\n", + " 'doesnt',\n", + " 'hurt',\n", + " 'marcellojun',\n", + " 'sheepl',\n", + " 'sadli',\n", + " 'chang',\n", + " 'prize',\n", + " 'sorri',\n", + " 'still',\n", + " 'conquest',\n", + " 'bad',\n", + " 'scynet',\n", + " 'thank',\n", + " 'rt',\n", + " 'mention',\n", + " 'galtim',\n", + " 'get',\n", + " 'use',\n", + " 'thi',\n", + " 'phone',\n", + " 'tri',\n", + " 'twitterfon',\n", + " 'move',\n", + " 'stuff',\n", + " 'oh',\n", + " 'miss',\n", + " 'old',\n", + " 'risarm',\n", + " 'urgh',\n", + " 'ad',\n", + " 'tv',\n", + " 'australia',\n", + " 'say',\n", + " 'quot',\n", + " 'u',\n", + " 'dont',\n", + " 'like',\n", + " 'chicken',\n", + " 'someth',\n", + " 'wrong',\n", + " 'disgust',\n", + " 'spent',\n", + " 'last',\n", + " 'night',\n", + " 'clean',\n", + " 'fish',\n", + " 'tank',\n", + " 'morn',\n", + " 'mop',\n", + " 'overnight',\n", + " 'leak',\n", + " 'spend',\n", + " 'next',\n", + " 'hour',\n", + " 'buy',\n", + " 'new',\n", + " 'nbc',\n", + " 'even',\n", + " 'read',\n", + " 'stori',\n", + " 'thnx',\n", + " 'follow',\n", + " 'alauderdal',\n", + " 'gurumonet',\n", + " 'bum',\n", + " 'three',\n", + " 'plan',\n", + " 'got',\n", + " 'cancel',\n", + " 'today',\n", + " 'urg',\n", + " 'steve',\n", + " 'job',\n", + " 'liver',\n", + " 'transplant',\n", + " 'doe',\n", + " 'mean',\n", + " 'metastat',\n", + " 'cancer',\n", + " 'appl',\n", + " 'ha',\n", + " 'workingipod',\n", + " 'lie',\n", + " 'around',\n", + " 'longer',\n", + " 'borrow',\n", + " 'neither',\n", + " 'mine',\n", + " 'order',\n", + " 'monday',\n", + " 'bedtim',\n", + " 'goodnight',\n", + " 'world',\n", + " 'chynnedol',\n", + " 'aww',\n", + " 'turn',\n", + " 'tire',\n", + " 'car',\n", + " 'littl',\n", + " 'recruit',\n", + " 'help',\n", + " 'want',\n", + " 'lauradunn',\n", + " 'didnt',\n", + " 'find',\n", + " 'wud',\n", + " 'dr',\n", + " 'way',\n", + " 'im',\n", + " 'gettin',\n", + " 'bk',\n", + " 'nd',\n", + " 'pic',\n", + " 'ye',\n", + " 'ciara',\n", + " 'ipod',\n", + " 'wa',\n", + " 'b',\n", + " 'majordodson',\n", + " 'tweet',\n", + " 'lion',\n", + " 'bar',\n", + " 'breakfast',\n", + " 'cereal',\n", + " 'amaz',\n", + " 'whi',\n", + " 'uk',\n", + " 'omg',\n", + " 'hay',\n", + " 'fever',\n", + " 'aw',\n", + " 'year',\n", + " 'lost',\n", + " 'without',\n", + " 'sa',\n", + " 'whole',\n", + " 'week',\n", + " 'ani',\n", + " 'done',\n", + " 'nice',\n", + " 'weather',\n", + " 'glastonburi',\n", + " 'right',\n", + " 'mayb',\n", + " 'ohh',\n", + " 'peanut',\n", + " 'butter',\n", + " 'chocol',\n", + " 'ice',\n", + " 'cream',\n", + " 'plea',\n", + " 'adrianaalv',\n", + " 'itsjavin',\n", + " 'sowwi',\n", + " 'answer',\n", + " 'outta',\n", + " 'text',\n", + " 'rememb',\n", + " 'look',\n", + " 'zhe',\n", + " 'jacket',\n", + " 'haha',\n", + " 'skriptkeep',\n", + " 'yup',\n", + " 'rlyddsn',\n", + " 'know',\n", + " 'realli',\n", + " 'need',\n", + " 'wake',\n", + " 'becaus',\n", + " 'lonley',\n", + " 'back',\n", + " 'later',\n", + " 'batteri',\n", + " 'go',\n", + " 'bye',\n", + " 'watch',\n", + " 'smallvil',\n", + " 'studi',\n", + " 'math',\n", + " 'fun',\n", + " 'byeeeeeeeee',\n", + " 'left',\n", + " 'offic',\n", + " 'lol',\n", + " 'herecomesdomzi',\n", + " 'huh',\n", + " 'cri',\n", + " 'forget',\n", + " 'baaaaackach',\n", + " 'cut',\n", + " 'fabric',\n", + " 'show',\n", + " 'long',\n", + " 'goodby',\n", + " 'katharot',\n", + " 'one',\n", + " 'twitter',\n", + " 'hang',\n", + " 'ryan',\n", + " 'v',\n", + " 'card',\n", + " 'mirror',\n", + " 'windbrisk',\n", + " 'overrun',\n", + " 'topdeck',\n", + " 'wrath',\n", + " 'take',\n", + " 'home',\n", + " 'cant',\n", + " 'think',\n", + " 'w',\n", + " 'head',\n", + " 'effin',\n", + " 'allergi',\n", + " 'ugh',\n", + " 'anyway',\n", + " 'check',\n", + " 'mscaseycart',\n", + " 'blogspot',\n", + " 'com',\n", + " 'ya',\n", + " 'claireliz',\n", + " 'hungri',\n", + " 'mtv',\n", + " 'movi',\n", + " 'award',\n", + " 'tonight',\n", + " 'ta',\n", + " 'tomorrow',\n", + " 'start',\n", + " 'handbal',\n", + " 'om',\n", + " 'dure',\n", + " 'carrot',\n", + " 'girl',\n", + " 'definit',\n", + " 'kid',\n", + " 'melissa',\n", + " 'debbiejjohnson',\n", + " 'learn',\n", + " 'rule',\n", + " 'desk',\n", + " 'gon',\n", + " 'na',\n", + " 'ahhh',\n", + " 'offici',\n", + " 'sad',\n", + " 'carriebeth',\n", + " 'ace',\n", + " 'told',\n", + " 'sound',\n", + " 'free',\n", + " 'unlimit',\n", + " 'internet',\n", + " 'magic',\n", + " 'word',\n", + " 'dad',\n", + " 'amp',\n", + " 'mom',\n", + " 'channel',\n", + " 'concern',\n", + " 'safeti',\n", + " 'twosharon',\n", + " 'call',\n", + " 'wan',\n", + " 'ztnewetnorb',\n", + " 'sure',\n", + " 'thing',\n", + " 'driven',\n", + " 'first',\n", + " 'place',\n", + " 'ride',\n", + " 'bike',\n", + " 'newcastl',\n", + " 'babi',\n", + " 'pangang',\n", + " 'friend',\n", + " 'time',\n", + " 'made',\n", + " 'vickijonasx',\n", + " 'excelentceleri',\n", + " 'sick',\n", + " 'guy',\n", + " 'reidspe',\n", + " 'awesom',\n", + " 'set',\n", + " 'alway',\n", + " 'enjoy',\n", + " 'see',\n", + " 'perform',\n", + " 'forgot',\n", + " 'put',\n", + " 'deodor',\n", + " 'shut',\n", + " 'francefalcon',\n", + " 'also',\n", + " 'cold',\n", + " 'azbeen',\n", + " 'whose',\n", + " 'skate',\n", + " 'make',\n", + " 'midasoracl',\n", + " 'coverag',\n", + " 'pre',\n", + " 'beta',\n", + " 'sleep',\n", + " 'talk',\n", + " 'kyle',\n", + " 'travel',\n", + " 'usa',\n", + " 'well',\n", + " 'princessjenn',\n", + " 'yeah',\n", + " 'guess',\n", + " 'stress',\n", + " 'max',\n", + " 'woke',\n", + " 'scream',\n", + " 'joyyy',\n", + " 'vey',\n", + " 'lay',\n", + " 'bed',\n", + " 'noth',\n", + " 'day',\n", + " 'andi',\n", + " 'ok',\n", + " 'clarissasay',\n", + " 'hahaa',\n", + " 'tell',\n", + " 'good',\n", + " 'pure',\n", + " 'protein',\n", + " 'squar',\n", + " 'pain',\n", + " 'belli',\n", + " 'listen',\n", + " 'blake',\n", + " 'shelton',\n", + " 'cld',\n", + " 'ju',\n", + " 'yack',\n", + " 'wld',\n", + " 'feel',\n", + " 'better',\n", + " 'ughhh',\n", + " 'goin',\n", + " 'feeln',\n", + " 'shit',\n", + " 'carrietari',\n", + " 'complet',\n", + " 'behind',\n", + " 'bridal',\n", + " 'magazin',\n", + " 'saw',\n", + " 'wed',\n", + " 'featur',\n", + " 'sunni',\n", + " 'rsonneyj',\n", + " 'onli',\n", + " 'godin',\n", + " 'mktng',\n", + " 'god',\n", + " 'raju',\n", + " 'raj',\n", + " 'discov',\n", + " 'ninja',\n", + " 'bunni',\n", + " 'fan',\n", + " 'art',\n", + " 'cwoolbrightjr',\n", + " 'duck',\n", + " 'abl',\n", + " 'hi',\n", + " 'talkin',\n", + " 'junk',\n", + " 'fri',\n", + " 'stop',\n", + " 'inteu',\n", + " 'valcatherin',\n", + " 'refus',\n", + " 'choir',\n", + " 'school',\n", + " 'blurgh',\n", + " 'happi',\n", + " 'xo',\n", + " 'tricksatthebar',\n", + " 'room',\n", + " 'hous',\n", + " 'adam',\n", + " 'paper',\n", + " 'went',\n", + " 'carri',\n", + " 'trespass',\n", + " 'nudg',\n", + " 'term',\n", + " 'cloudi',\n", + " 'outsid',\n", + " 'swizzlesqueak',\n", + " 'particularli',\n", + " 'male',\n", + " 'territori',\n", + " 'fierc',\n", + " 'protect',\n", + " 'anoth',\n", + " 'pretti',\n", + " 'jobless',\n", + " 'come',\n", + " 'throw',\n", + " 'desert',\n", + " 'accident',\n", + " 'ate',\n", + " 'gossip',\n", + " 'p',\n", + " 'everyon',\n", + " 'catchup',\n", + " 'weekend',\n", + " 'issu',\n", + " 'backlog',\n", + " 'manual',\n", + " 'hmmm',\n", + " 'grandson',\n", + " 'sent',\n", + " 'blackberri',\n", + " 'smartphon',\n", + " 'sprintspe',\n", + " 'kmlc',\n", + " 'charm',\n", + " 'ask',\n", + " 'nikon',\n", + " 'user',\n", + " 'l',\n", + " 'glass',\n", + " 'leav',\n", + " 'stomp',\n", + " 'huff',\n", + " 'dunno',\n", + " 'shot',\n", + " 'final',\n", + " 'yaayyy',\n", + " 'ashley',\n", + " 'tisdal',\n", + " 'germani',\n", + " 'comet',\n", + " 'hope',\n", + " 'sometim',\n", + " 'signal',\n", + " 'area',\n", + " 'felt',\n", + " 'lone',\n", + " 'moneymarv',\n", + " 'sup',\n", + " 'kuzzo',\n", + " 'yo',\n", + " 'tyme',\n", + " 'moro',\n", + " 'send',\n", + " 'wife',\n", + " 'airport',\n", + " 'ill',\n", + " 'found',\n", + " 'familiar',\n", + " 'sloooowwww',\n", + " 'fav',\n", + " 'snl',\n", + " 'wish',\n", + " 'hey',\n", + " 'deserv',\n", + " 'vacat',\n", + " 'nyc',\n", + " 'though',\n", + " 'congrat',\n", + " 'chri',\n", + " 'krisztian',\n", + " 'miszer',\n", + " 'beauti',\n", + " 'member',\n", + " 'kcsd',\n", + " 'must',\n", + " 'end',\n", + " 'yet',\n", + " 'summer',\n", + " 'bring',\n", + " 'mani',\n", + " 'cup',\n", + " 'hot',\n", + " 'tea',\n", + " 'honey',\n", + " 'could',\n", + " 'rid',\n", + " 'sore',\n", + " 'throat',\n", + " 'damn',\n", + " 'htc',\n", + " 'delay',\n", + " 'vodafon',\n", + " 'wait',\n", + " 'langer',\n", + " 'marleematlin',\n", + " 'genesimmon',\n", + " 'real',\n", + " 'alreadi',\n", + " 'followfriday',\n", + " 'itvdotcom',\n", + " 'rel',\n", + " 'emma',\n", + " 'ticket',\n", + " 'thursday',\n", + " 'wednesday',\n", + " 'play',\n", + " 'sim',\n", + " 'badli',\n", + " 'famili',\n", + " 'liam',\n", + " 'lime',\n", + " 'havent',\n", + " 'myspac',\n", + " 'half',\n", + " 'least',\n", + " 'tube',\n", + " 'finish',\n", + " 'chem',\n", + " 'part',\n", + " 'anyon',\n", + " 'idea',\n", + " 'sweetestcassi',\n", + " 'nothin',\n", + " 'werk',\n", + " 'nite',\n", + " 'cmaxcoop',\n", + " 'meant',\n", + " 'dri',\n", + " 'water',\n", + " 'giggl',\n", + " 'ever',\n", + " 'readi',\n", + " 'ladi',\n", + " 'mode',\n", + " 'shower',\n", + " 'wat',\n", + " 'happen',\n", + " 'meee',\n", + " 'krissi',\n", + " 'true',\n", + " 'somewher',\n", + " 'london',\n", + " 'manchest',\n", + " 'murder',\n", + " 'everi',\n", + " 'seem',\n", + " 'morrigan',\n", + " 'hug',\n", + " 'blfc',\n", + " 'yesterday',\n", + " 'sarmi',\n", + " 'drama',\n", + " 'calisu',\n", + " 'plant',\n", + " 'tree',\n", + " 'honor',\n", + " 'father',\n", + " 'bet',\n", + " 'busi',\n", + " 'jwilphoto',\n", + " 'keyshia',\n", + " 'truth',\n", + " 'speak',\n", + " 'concert',\n", + " 'theduncan',\n", + " 'quit',\n", + " 'confus',\n", + " 'control',\n", + " 'yay',\n", + " 'easier',\n", + " 'montanatuck',\n", + " 'hmm',\n", + " 'serious',\n", + " 'mmmm',\n", + " 'tiramisu',\n", + " 'beach',\n", + " 'haz',\n", + " 'colleg',\n", + " 'okay',\n", + " 'bird',\n", + " 'chirp',\n", + " 'attempt',\n", + " 'ish',\n", + " 'toodl',\n", + " 'juaner',\n", + " 'ahh',\n", + " 'juan',\n", + " 'dymilkshak',\n", + " 'soon',\n", + " 'tail',\n", + " 'write',\n", + " 'stupid',\n", + " 'pen',\n", + " 'took',\n", + " 'bobbytommi',\n", + " 'yeahh',\n", + " 'let',\n", + " 'msja',\n", + " 'hate',\n", + " 'group',\n", + " 'alyxxdion',\n", + " 'jenni',\n", + " 'jonasbroth',\n", + " 'liverpool',\n", + " 'co',\n", + " 'beatl',\n", + " 'live',\n", + " 'zuton',\n", + " 'dtown',\n", + " 'vanessacvaldez',\n", + " 'mexican',\n", + " 'blog',\n", + " 'never',\n", + " 'cuz',\n", + " 'lazi',\n", + " 'endlesswhimsi',\n", + " 'post',\n", + " 'boyfriend',\n", + " 'veri',\n", + " 'cycl',\n", + " 'buddi',\n", + " 'fell',\n", + " 'maddenlov',\n", + " 'understand',\n", + " 'exist',\n", + " 'aaahhhhhhhhhhhhhhhhhhhhhhhhhh',\n", + " 'nom',\n", + " 'kinda',\n", + " 'raini',\n", + " 'queesi',\n", + " 'fruster',\n", + " 'anger',\n", + " 'team',\n", + " 'staceyfreeadr',\n", + " 'stacey',\n", + " 'bobbi',\n", + " 'comment',\n", + " 'alan',\n", + " 'fwd',\n", + " 'name',\n", + " 'afternoon',\n", + " 'rather',\n", + " 'relax',\n", + " 'conserv',\n", + " 'cdwow',\n", + " 'simpl',\n", + " 'mind',\n", + " 'amazon',\n", + " 'canceld',\n", + " 'biomekkanik',\n", + " 'album',\n", + " 'vnv',\n", + " 'nation',\n", + " 'arriv',\n", + " 'tommcflyi',\n", + " 'freak',\n", + " 'repli',\n", + " 'joezor',\n", + " 'point',\n", + " 'win',\n", + " 'best',\n", + " 'excus',\n", + " 'swim',\n", + " 'mradamlambert',\n", + " 'normal',\n", + " 'would',\n", + " 'sinc',\n", + " 'isnt',\n", + " 'amon',\n", + " 'bore',\n", + " 'gypsyraven',\n", + " 'tub',\n", + " 'logi',\n", + " 'x',\n", + " 'liz',\n", + " 'tattoo',\n", + " 'cash',\n", + " 'nikkibenz',\n", + " 'jaylastarr',\n", + " 'nighti',\n", + " 'kind',\n", + " 'jennif',\n", + " 'either',\n", + " 'kentucki',\n", + " 'derbi',\n", + " 'mint',\n", + " 'julep',\n", + " 'situp',\n", + " 'km',\n", + " 'build',\n", + " 'gunz',\n", + " 'dreadkey',\n", + " 'k',\n", + " 'batter',\n", + " 'bruis',\n", + " 'arm',\n", + " 'tambourin',\n", + " 'self',\n", + " 'conflict',\n", + " 'qc',\n", + " 'great',\n", + " 'souleyede',\n", + " 'tue',\n", + " 'jame',\n", + " 'phelp',\n", + " 'hedgehog',\n", + " 'sega',\n", + " 'mega',\n", + " 'drive',\n", + " 'game',\n", + " 'almond',\n", + " 'nut',\n", + " 'furr',\n", + " 'victori',\n", + " 'ze',\n", + " 'dane',\n", + " 'bob',\n", + " 'dylan',\n", + " 'aint',\n", + " 'babe',\n", + " 'import',\n", + " 'iim',\n", + " 'late',\n", + " 'forward',\n", + " 'iphon',\n", + " 'softwar',\n", + " 'updat',\n", + " 'tallerguy',\n", + " 'sigh',\n", + " 'full',\n", + " 'proce',\n", + " 'lovin',\n", + " 'georg',\n", + " 'gina',\n", + " 'luci',\n", + " 'bag',\n", + " 'yazeez',\n", + " 'pink',\n", + " 'amyk',\n", + " 'geek',\n", + " 'label',\n", + " 'dymo',\n", + " 'labelwrit',\n", + " 'fall',\n", + " 'catagori',\n", + " 'appar',\n", + " 'moekelsak',\n", + " 'via',\n", + " 'owl',\n", + " 'save',\n", + " 'reali',\n", + " 'togeth',\n", + " 'farewel',\n", + " 'hame',\n", + " 'mtstanford',\n", + " 'kno',\n", + " 'luck',\n", + " 'goodev',\n", + " 'peopl',\n", + " 'dish',\n", + " 'si',\n", + " 'came',\n", + " 'yell',\n", + " 'camera',\n", + " 'bit',\n", + " 'blitz',\n", + " 'parent',\n", + " 'dundundun',\n", + " 'uni',\n", + " 'revis',\n", + " 'sunshine',\n", + " 'interview',\n", + " 'epicwat',\n", + " 'add',\n", + " 'heh',\n", + " 'soooo',\n", + " 'sleepi',\n", + " 'ben',\n", + " 'tommcfli',\n", + " 'pleaaaaas',\n", + " 'steffi',\n", + " 'ah',\n", + " 'almost',\n", + " 'ima',\n", + " 'lt',\n", + " 'chip',\n", + " 'mad',\n", + " 'season',\n", + " 'far',\n", + " 'flawlessli',\n", + " 'pleasebitem',\n", + " 'total',\n", + " 'fair',\n", + " 'daughter',\n", + " 'bday',\n", + " 'parti',\n", + " 'theme',\n", + " 'hello',\n", + " 'kitti',\n", + " 'cake',\n", + " 'pattygal',\n", + " 'patti',\n", + " 'agre',\n", + " 'itsashlz',\n", + " 'mmmmmwwwwaaaaaarrrrr',\n", + " 'therobotard',\n", + " 'tsnydermtg',\n", + " 'jump',\n", + " 'said',\n", + " 'layer',\n", + " 'onion',\n", + " 'approach',\n", + " 'met',\n", + " 'ef',\n", + " 'rain',\n", + " 'wear',\n", + " 'blue',\n", + " 'poncho',\n", + " 'sctape',\n", + " 'youngq',\n", + " 'gr',\n", + " 'rob',\n", + " 'mixingtp',\n", + " 'nitenit',\n", + " 'st',\n", + " 'month',\n", + " 'woo',\n", + " 'hoo',\n", + " 'june',\n", + " 'punish',\n", + " 'claireolivar',\n", + " 'moon',\n", + " 'butt',\n", + " 'grrrrrrrrr',\n", + " 'sooooooooooo',\n", + " 'darn',\n", + " 'slow',\n", + " 'stephanieellen',\n", + " 'thanx',\n", + " 'headach',\n", + " 'advic',\n", + " 'eye',\n", + " 'longest',\n", + " 'lucr',\n", + " 'career',\n", + " 'gomeztheband',\n", + " 'sort',\n", + " 'kept',\n", + " 'toe',\n", + " 'ramesrandrew',\n", + " 'interest',\n", + " 'hear',\n", + " 'handsfre',\n", + " 'bork',\n", + " 'silent',\n", + " 'hide',\n", + " 'rock',\n", + " 'nadtriadina',\n", + " 'glenanderson',\n", + " 'wors',\n", + " 'booooo',\n", + " 'earthshinedesig',\n", + " 'arduino',\n", + " 'broke',\n", + " 'atmega',\n", + " 'pack',\n", + " 'muahahahah',\n", + " 'hangov',\n", + " 'mcgiff',\n", + " 'join',\n", + " 'smile',\n", + " 'lunchtim',\n", + " 'weight',\n", + " 'except',\n", + " 'cheat',\n", + " 'margiethiel',\n", + " 'advantag',\n", + " 'beerealti',\n", + " 'dawniecahil',\n", + " 'awww',\n", + " 'hillari',\n", + " 'idk',\n", + " 'tatt',\n", + " 'wee',\n", + " 'secur',\n", + " 'jennykiwi',\n", + " 'local',\n", + " 'pet',\n", + " 'store',\n", + " 'sell',\n", + " 'cock',\n", + " 'flight',\n", + " 'slum',\n", + " 'departur',\n", + " 'loung',\n", + " 'zckenni',\n", + " 'consult',\n", + " 'recommend',\n", + " 'someon',\n", + " 'stila',\n", + " 'unverifi',\n", + " 'sourc',\n", + " 'ltte',\n", + " 'colombo',\n", + " 'investig',\n", + " 'bodi',\n", + " 'excit',\n", + " 'carm',\n", + " 'lngoeyegrl',\n", + " 'glad',\n", + " 'celebr',\n", + " 'nicolejp',\n", + " 'sumrheat',\n", + " 'decid',\n", + " 'meet',\n", + " 'backstag',\n", + " 'gether',\n", + " 'cal',\n", + " 'nalgen',\n", + " 'bottl',\n", + " 'dehydr',\n", + " 'shrivel',\n", + " 'raisin',\n", + " 'chemistri',\n", + " 'baaahh',\n", + " 'mileymonday',\n", + " 'retweet',\n", + " 'onload',\n", + " 'alert',\n", + " 'gt',\n", + " 'fail',\n", + " 'shadowsierra',\n", + " 'cage',\n", + " 'anywher',\n", + " 'boy',\n", + " 'lame',\n", + " 'men',\n", + " 'buttt',\n", + " 'lmfao',\n", + " 'tho',\n", + " 'poor',\n", + " 'sarah',\n", + " 'allieandra',\n", + " 'harriettaa',\n", + " 'cur',\n", + " 'imperi',\n", + " 'non',\n", + " 'metric',\n", + " 'system',\n", + " 'confound',\n", + " 'stay',\n", + " 'rove',\n", + " 'chariti',\n", + " 'support',\n", + " 'logoguppi',\n", + " 'john',\n", + " 'similar',\n", + " 'geni',\n", + " 'aladdin',\n", + " 'americanapparel',\n", + " 'surviv',\n", + " 'quak',\n", + " 'option',\n", + " 'deep',\n", + " 'crew',\n", + " 'neck',\n", + " 'sister',\n", + " 'collar',\n", + " 'bone',\n", + " 'broken',\n", + " 'lthagreat',\n", + " 'twonswaggcheck',\n", + " 'ohhhh',\n", + " 'digg',\n", + " 'sameerpatel',\n", + " 'product',\n", + " 'box',\n", + " 'hilari',\n", + " 'kate',\n", + " 'expect',\n", + " 'faster',\n", + " 'pacquiao',\n", + " 'fight',\n", + " 'monkaaay',\n", + " 'lil',\n", + " ...]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "# your code here\n", + "count_freq= nltk.FreqDist()\n", + "\n", + "for tweet in sample[\"text_processed\"]:\n", + " for word in tweet:\n", + " count_freq[word]+=1\n", + "\n", + "top_5k_words= list(count_freq.keys())[:5000]\n", + "top_5k_words" ] }, { @@ -167,11 +1520,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20000\n" + ] + } + ], "source": [ - "# your code here" + "\n", + "# your code here\n", + "def find_features(document):\n", + " words = set(document)\n", + " features = {}\n", + " for w in top_5k_words:\n", + " features[w] = (w in words)\n", + " return features\n", + "\n", + "features_set= [(find_features(tweet), target) for (tweet, target) in list(zip(sample['text_processed'], sample['target']))]\n", + "print(len(features_set))" ] }, { @@ -210,11 +1581,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "\n", + "from textblob.classifiers import NaiveBayesClassifier\n", + "train_set,test_set = features_set[:10000],features_set[10000:]\n", + "classifier = nltk.NaiveBayesClassifier.train(train_set)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier.show_most_informative_features()" ] }, { @@ -298,7 +1682,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.12 ('base')", "language": "python", "name": "python3" }, @@ -312,7 +1696,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.12" + }, + "vscode": { + "interpreter": { + "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186" + } } }, "nbformat": 4,