diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..e15304e 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,9 +66,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
+ "source": [
+ "import re \n",
+ "import nltk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ironhack s q website is'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
@@ -79,7 +100,14 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " string= re.sub(r'http\\S+','',s)\n",
+ " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+ " \n",
+ "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n",
+ "\n",
+ "test_string = clean_up(test)\n",
+ "test_string"
]
},
{
@@ -101,7 +129,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -114,7 +142,29 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " return nltk.word_tokenize(s)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'website', 'is']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokens = tokenize(test_string)\n",
+ "tokens"
]
},
{
@@ -145,7 +195,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.stem import PorterStemmer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -158,7 +218,37 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " ps = nltk.PorterStemmer()\n",
+ " lemmatizer = nltk.WordNetLemmatizer()\n",
+ " l2 = []\n",
+ " \n",
+ " for w in l:\n",
+ " s = ps.stem(w)\n",
+ " s = lemmatizer.lemmatize(s)\n",
+ " l2 += [s]\n",
+ " \n",
+ " return l2\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'websit', 'is']"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "stem_and_lemmatize(tokens)"
]
},
{
@@ -176,10 +266,19 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "r n h c k q w e b e \n"
+ ]
+ }
+ ],
"source": [
+ "from nltk.corpus import stopwords\n",
"def remove_stopwords(l):\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
@@ -189,9 +288,21 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " stop_words = stopwords.words('english')\n",
+ "\n",
+ " return ' '.join([w for w in l if w not in stop_words])\n",
+ "\n",
+ "print(remove_stopwords(test_string))"
]
},
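+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Putting it together: a minimal sketch chaining the four helpers on the sample string above (the exact tokens depend on the installed NLTK data)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Full pipeline: clean -> tokenize -> stem/lemmatize -> drop stopwords.\n",
+    "remove_stopwords(stem_and_lemmatize(tokenize(clean_up(test))))"
+   ]
+  },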
{
"cell_type": "markdown",
"metadata": {},
@@ -204,7 +315,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
@@ -218,7 +329,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.9.12"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
+ }
}
},
"nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..5f5bb5a 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -18,8 +18,8 @@
"\n",
"```python\n",
">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
- ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide.
",
- "
",
+ ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide.
\n",
+ "
\n",
"Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n",
">>> analyzer = SentimentIntensityAnalyzer()\n",
">>> analyzer.polarity_scores(txt)\n",
@@ -46,11 +46,344 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from nltk.corpus import stopwords\n",
+ "import re\n",
+ "import nltk\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from nltk.probability import ConditionalFreqDist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "C:/Users/franc/Desktop/Labs Iron Hack/Last labs 16.07.2022/lab-nlp/training_dataset.csv\n"
+ ]
+ }
+ ],
+ "source": [
+ "import tkinter as tk\n",
+ "from tkinter.filedialog import askopenfilename\n",
+ "import pandas as pd\n",
+ "\n",
+ "root = tk.Tk()\n",
+ "root.withdraw() #Prevents the Tkinter window to come up\n",
+ "exlpath = askopenfilename()\n",
+ "root.destroy()\n",
+ "print(exlpath)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# your code here\n",
+ "def clean_up(s):\n",
+ " \"\"\"\n",
+ " Cleans up numbers, URLs, and special characters from a string.\n",
+ "\n",
+ " Args:\n",
+ " s: The string to be cleaned up.\n",
+ "\n",
+ " Returns:\n",
+ " A string that has been cleaned up.\n",
+ " \"\"\"\n",
+ " string = re.sub(r'http\\S+', '', s)\n",
+ " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+ "\n",
+ "def tokenize(s):\n",
+ " \"\"\"\n",
+ " Tokenize a string.\n",
+ "\n",
+ " Args:\n",
+ " s: String to be tokenized.\n",
+ "\n",
+ " Returns:\n",
+ " A list of words as the result of tokenization.\n",
+ " \"\"\"\n",
+ " return nltk.word_tokenize(s)\n",
+ "\n",
+ "def stem_and_lemmatize(l):\n",
+ " \n",
+ " \"\"\"\n",
+ " Perform stemming and lemmatization on a list of words.\n",
+ "\n",
+ " Args:\n",
+ " l: A list of strings.\n",
+ "\n",
+ " Returns:\n",
+ " A list of strings after being stemmed and lemmatized.\n",
+ " \"\"\"\n",
+ " ps = nltk.PorterStemmer()\n",
+ " lemmatizer = nltk.WordNetLemmatizer()\n",
+ " l2 = []\n",
+ " \n",
+ " for w in l:\n",
+ " s = ps.stem(w)\n",
+ " s = lemmatizer.lemmatize(s)\n",
+ " l2 += [s]\n",
+ " \n",
+ " return l2\n",
+ "\n",
+ "\n",
+ "def remove_stopwords(l):\n",
+ " \"\"\"\n",
+ " Remove English stopwords from a list of strings.\n",
+ "\n",
+ " Args:\n",
+ " l: A list of strings.\n",
+ "\n",
+ " Returns:\n",
+ " A list of strings after stop words are removed.\n",
+ " \"\"\"\n",
+ " stop_words = stopwords.words('english')\n",
+ "\n",
+ " return [w for w in l if w not in stop_words]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweets = pd.read_csv(exlpath, engine='python',encoding=\"ISO-8859-1\",names=['Index_0', 'DATE', 'QUERY', 'User','text'], header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index_0 | \n",
+ " DATE | \n",
+ " QUERY | \n",
+ " User | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1467810369 | \n",
+ " Mon Apr 06 22:19:45 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _TheSpecialOne_ | \n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, t... | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 1467811184 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElleCTF | \n",
+ " my whole body feels itchy and like its on fire | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 1467811193 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Karoli | \n",
+ " @nationwideclass no, it's not behaving at all.... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index_0 DATE QUERY User \\\n",
+ "0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \n",
+ "0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
+ "0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
+ "0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
+ "0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
+ "\n",
+ " text \n",
+ "0 @switchfoot http://twitpic.com/2y1zl - Awww, t... \n",
+ "0 is upset that he can't update his Facebook by ... \n",
+ "0 @Kenichan I dived many times for the ball. Man... \n",
+ "0 my whole body feels itchy and like its on fire \n",
+ "0 @nationwideclass no, it's not behaving at all.... "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweets.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample = tweets.sample(20000)\n",
+ "sample['target'] = sample['Index_0'].replace(4, 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index_0 | \n",
+ " DATE | \n",
+ " QUERY | \n",
+ " User | \n",
+ " text | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1975865841 | \n",
+ " Sat May 30 16:04:18 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " martinsandoval | \n",
+ " I have so much grains... I'm scared | \n",
+ " 1975865841 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 2298482191 | \n",
+ " Tue Jun 23 11:44:01 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " drunkenscholar | \n",
+ " @LaurenWJohnston There's 1 copy and 5 requests. | \n",
+ " 2298482191 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 1882734354 | \n",
+ " Fri May 22 07:23:40 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " SophieCollinss | \n",
+ " is eating maltesers, have work at 5 | \n",
+ " 1882734354 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1880764926 | \n",
+ " Fri May 22 02:25:19 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mrQQ | \n",
+ " I'm in love with fairytale.. and it doesnt hur... | \n",
+ " 1880764926 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2066608145 | \n",
+ " Sun Jun 07 10:50:25 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " enithhernandez | \n",
+ " .@MarcelloJun or you are just a sheeple? | \n",
+ " 2066608145 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index_0 DATE QUERY User \\\n",
+ "0 1975865841 Sat May 30 16:04:18 PDT 2009 NO_QUERY martinsandoval \n",
+ "0 2298482191 Tue Jun 23 11:44:01 PDT 2009 NO_QUERY drunkenscholar \n",
+ "0 1882734354 Fri May 22 07:23:40 PDT 2009 NO_QUERY SophieCollinss \n",
+ "4 1880764926 Fri May 22 02:25:19 PDT 2009 NO_QUERY mrQQ \n",
+ "4 2066608145 Sun Jun 07 10:50:25 PDT 2009 NO_QUERY enithhernandez \n",
+ "\n",
+ " text target \n",
+ "0 I have so much grains... I'm scared 1975865841 \n",
+ "0 @LaurenWJohnston There's 1 copy and 5 requests. 2298482191 \n",
+ "0 is eating maltesers, have work at 5 1882734354 \n",
+ "4 I'm in love with fairytale.. and it doesnt hur... 1880764926 \n",
+ "4 .@MarcelloJun or you are just a sheeple? 2066608145 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample.head()"
]
},
{
@@ -76,11 +409,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "# your code here\n",
+ "sample[\"text_processed\"]= sample[\"text\"].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)"
]
},
{
@@ -98,11 +432,1030 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['much',\n",
+ " 'grain',\n",
+ " 'scare',\n",
+ " 'laurenwjohnston',\n",
+ " 'copi',\n",
+ " 'request',\n",
+ " 'eat',\n",
+ " 'maltes',\n",
+ " 'work',\n",
+ " 'love',\n",
+ " 'fairytal',\n",
+ " 'doesnt',\n",
+ " 'hurt',\n",
+ " 'marcellojun',\n",
+ " 'sheepl',\n",
+ " 'sadli',\n",
+ " 'chang',\n",
+ " 'prize',\n",
+ " 'sorri',\n",
+ " 'still',\n",
+ " 'conquest',\n",
+ " 'bad',\n",
+ " 'scynet',\n",
+ " 'thank',\n",
+ " 'rt',\n",
+ " 'mention',\n",
+ " 'galtim',\n",
+ " 'get',\n",
+ " 'use',\n",
+ " 'thi',\n",
+ " 'phone',\n",
+ " 'tri',\n",
+ " 'twitterfon',\n",
+ " 'move',\n",
+ " 'stuff',\n",
+ " 'oh',\n",
+ " 'miss',\n",
+ " 'old',\n",
+ " 'risarm',\n",
+ " 'urgh',\n",
+ " 'ad',\n",
+ " 'tv',\n",
+ " 'australia',\n",
+ " 'say',\n",
+ " 'quot',\n",
+ " 'u',\n",
+ " 'dont',\n",
+ " 'like',\n",
+ " 'chicken',\n",
+ " 'someth',\n",
+ " 'wrong',\n",
+ " 'disgust',\n",
+ " 'spent',\n",
+ " 'last',\n",
+ " 'night',\n",
+ " 'clean',\n",
+ " 'fish',\n",
+ " 'tank',\n",
+ " 'morn',\n",
+ " 'mop',\n",
+ " 'overnight',\n",
+ " 'leak',\n",
+ " 'spend',\n",
+ " 'next',\n",
+ " 'hour',\n",
+ " 'buy',\n",
+ " 'new',\n",
+ " 'nbc',\n",
+ " 'even',\n",
+ " 'read',\n",
+ " 'stori',\n",
+ " 'thnx',\n",
+ " 'follow',\n",
+ " 'alauderdal',\n",
+ " 'gurumonet',\n",
+ " 'bum',\n",
+ " 'three',\n",
+ " 'plan',\n",
+ " 'got',\n",
+ " 'cancel',\n",
+ " 'today',\n",
+ " 'urg',\n",
+ " 'steve',\n",
+ " 'job',\n",
+ " 'liver',\n",
+ " 'transplant',\n",
+ " 'doe',\n",
+ " 'mean',\n",
+ " 'metastat',\n",
+ " 'cancer',\n",
+ " 'appl',\n",
+ " 'ha',\n",
+ " 'workingipod',\n",
+ " 'lie',\n",
+ " 'around',\n",
+ " 'longer',\n",
+ " 'borrow',\n",
+ " 'neither',\n",
+ " 'mine',\n",
+ " 'order',\n",
+ " 'monday',\n",
+ " 'bedtim',\n",
+ " 'goodnight',\n",
+ " 'world',\n",
+ " 'chynnedol',\n",
+ " 'aww',\n",
+ " 'turn',\n",
+ " 'tire',\n",
+ " 'car',\n",
+ " 'littl',\n",
+ " 'recruit',\n",
+ " 'help',\n",
+ " 'want',\n",
+ " 'lauradunn',\n",
+ " 'didnt',\n",
+ " 'find',\n",
+ " 'wud',\n",
+ " 'dr',\n",
+ " 'way',\n",
+ " 'im',\n",
+ " 'gettin',\n",
+ " 'bk',\n",
+ " 'nd',\n",
+ " 'pic',\n",
+ " 'ye',\n",
+ " 'ciara',\n",
+ " 'ipod',\n",
+ " 'wa',\n",
+ " 'b',\n",
+ " 'majordodson',\n",
+ " 'tweet',\n",
+ " 'lion',\n",
+ " 'bar',\n",
+ " 'breakfast',\n",
+ " 'cereal',\n",
+ " 'amaz',\n",
+ " 'whi',\n",
+ " 'uk',\n",
+ " 'omg',\n",
+ " 'hay',\n",
+ " 'fever',\n",
+ " 'aw',\n",
+ " 'year',\n",
+ " 'lost',\n",
+ " 'without',\n",
+ " 'sa',\n",
+ " 'whole',\n",
+ " 'week',\n",
+ " 'ani',\n",
+ " 'done',\n",
+ " 'nice',\n",
+ " 'weather',\n",
+ " 'glastonburi',\n",
+ " 'right',\n",
+ " 'mayb',\n",
+ " 'ohh',\n",
+ " 'peanut',\n",
+ " 'butter',\n",
+ " 'chocol',\n",
+ " 'ice',\n",
+ " 'cream',\n",
+ " 'plea',\n",
+ " 'adrianaalv',\n",
+ " 'itsjavin',\n",
+ " 'sowwi',\n",
+ " 'answer',\n",
+ " 'outta',\n",
+ " 'text',\n",
+ " 'rememb',\n",
+ " 'look',\n",
+ " 'zhe',\n",
+ " 'jacket',\n",
+ " 'haha',\n",
+ " 'skriptkeep',\n",
+ " 'yup',\n",
+ " 'rlyddsn',\n",
+ " 'know',\n",
+ " 'realli',\n",
+ " 'need',\n",
+ " 'wake',\n",
+ " 'becaus',\n",
+ " 'lonley',\n",
+ " 'back',\n",
+ " 'later',\n",
+ " 'batteri',\n",
+ " 'go',\n",
+ " 'bye',\n",
+ " 'watch',\n",
+ " 'smallvil',\n",
+ " 'studi',\n",
+ " 'math',\n",
+ " 'fun',\n",
+ " 'byeeeeeeeee',\n",
+ " 'left',\n",
+ " 'offic',\n",
+ " 'lol',\n",
+ " 'herecomesdomzi',\n",
+ " 'huh',\n",
+ " 'cri',\n",
+ " 'forget',\n",
+ " 'baaaaackach',\n",
+ " 'cut',\n",
+ " 'fabric',\n",
+ " 'show',\n",
+ " 'long',\n",
+ " 'goodby',\n",
+ " 'katharot',\n",
+ " 'one',\n",
+ " 'twitter',\n",
+ " 'hang',\n",
+ " 'ryan',\n",
+ " 'v',\n",
+ " 'card',\n",
+ " 'mirror',\n",
+ " 'windbrisk',\n",
+ " 'overrun',\n",
+ " 'topdeck',\n",
+ " 'wrath',\n",
+ " 'take',\n",
+ " 'home',\n",
+ " 'cant',\n",
+ " 'think',\n",
+ " 'w',\n",
+ " 'head',\n",
+ " 'effin',\n",
+ " 'allergi',\n",
+ " 'ugh',\n",
+ " 'anyway',\n",
+ " 'check',\n",
+ " 'mscaseycart',\n",
+ " 'blogspot',\n",
+ " 'com',\n",
+ " 'ya',\n",
+ " 'claireliz',\n",
+ " 'hungri',\n",
+ " 'mtv',\n",
+ " 'movi',\n",
+ " 'award',\n",
+ " 'tonight',\n",
+ " 'ta',\n",
+ " 'tomorrow',\n",
+ " 'start',\n",
+ " 'handbal',\n",
+ " 'om',\n",
+ " 'dure',\n",
+ " 'carrot',\n",
+ " 'girl',\n",
+ " 'definit',\n",
+ " 'kid',\n",
+ " 'melissa',\n",
+ " 'debbiejjohnson',\n",
+ " 'learn',\n",
+ " 'rule',\n",
+ " 'desk',\n",
+ " 'gon',\n",
+ " 'na',\n",
+ " 'ahhh',\n",
+ " 'offici',\n",
+ " 'sad',\n",
+ " 'carriebeth',\n",
+ " 'ace',\n",
+ " 'told',\n",
+ " 'sound',\n",
+ " 'free',\n",
+ " 'unlimit',\n",
+ " 'internet',\n",
+ " 'magic',\n",
+ " 'word',\n",
+ " 'dad',\n",
+ " 'amp',\n",
+ " 'mom',\n",
+ " 'channel',\n",
+ " 'concern',\n",
+ " 'safeti',\n",
+ " 'twosharon',\n",
+ " 'call',\n",
+ " 'wan',\n",
+ " 'ztnewetnorb',\n",
+ " 'sure',\n",
+ " 'thing',\n",
+ " 'driven',\n",
+ " 'first',\n",
+ " 'place',\n",
+ " 'ride',\n",
+ " 'bike',\n",
+ " 'newcastl',\n",
+ " 'babi',\n",
+ " 'pangang',\n",
+ " 'friend',\n",
+ " 'time',\n",
+ " 'made',\n",
+ " 'vickijonasx',\n",
+ " 'excelentceleri',\n",
+ " 'sick',\n",
+ " 'guy',\n",
+ " 'reidspe',\n",
+ " 'awesom',\n",
+ " 'set',\n",
+ " 'alway',\n",
+ " 'enjoy',\n",
+ " 'see',\n",
+ " 'perform',\n",
+ " 'forgot',\n",
+ " 'put',\n",
+ " 'deodor',\n",
+ " 'shut',\n",
+ " 'francefalcon',\n",
+ " 'also',\n",
+ " 'cold',\n",
+ " 'azbeen',\n",
+ " 'whose',\n",
+ " 'skate',\n",
+ " 'make',\n",
+ " 'midasoracl',\n",
+ " 'coverag',\n",
+ " 'pre',\n",
+ " 'beta',\n",
+ " 'sleep',\n",
+ " 'talk',\n",
+ " 'kyle',\n",
+ " 'travel',\n",
+ " 'usa',\n",
+ " 'well',\n",
+ " 'princessjenn',\n",
+ " 'yeah',\n",
+ " 'guess',\n",
+ " 'stress',\n",
+ " 'max',\n",
+ " 'woke',\n",
+ " 'scream',\n",
+ " 'joyyy',\n",
+ " 'vey',\n",
+ " 'lay',\n",
+ " 'bed',\n",
+ " 'noth',\n",
+ " 'day',\n",
+ " 'andi',\n",
+ " 'ok',\n",
+ " 'clarissasay',\n",
+ " 'hahaa',\n",
+ " 'tell',\n",
+ " 'good',\n",
+ " 'pure',\n",
+ " 'protein',\n",
+ " 'squar',\n",
+ " 'pain',\n",
+ " 'belli',\n",
+ " 'listen',\n",
+ " 'blake',\n",
+ " 'shelton',\n",
+ " 'cld',\n",
+ " 'ju',\n",
+ " 'yack',\n",
+ " 'wld',\n",
+ " 'feel',\n",
+ " 'better',\n",
+ " 'ughhh',\n",
+ " 'goin',\n",
+ " 'feeln',\n",
+ " 'shit',\n",
+ " 'carrietari',\n",
+ " 'complet',\n",
+ " 'behind',\n",
+ " 'bridal',\n",
+ " 'magazin',\n",
+ " 'saw',\n",
+ " 'wed',\n",
+ " 'featur',\n",
+ " 'sunni',\n",
+ " 'rsonneyj',\n",
+ " 'onli',\n",
+ " 'godin',\n",
+ " 'mktng',\n",
+ " 'god',\n",
+ " 'raju',\n",
+ " 'raj',\n",
+ " 'discov',\n",
+ " 'ninja',\n",
+ " 'bunni',\n",
+ " 'fan',\n",
+ " 'art',\n",
+ " 'cwoolbrightjr',\n",
+ " 'duck',\n",
+ " 'abl',\n",
+ " 'hi',\n",
+ " 'talkin',\n",
+ " 'junk',\n",
+ " 'fri',\n",
+ " 'stop',\n",
+ " 'inteu',\n",
+ " 'valcatherin',\n",
+ " 'refus',\n",
+ " 'choir',\n",
+ " 'school',\n",
+ " 'blurgh',\n",
+ " 'happi',\n",
+ " 'xo',\n",
+ " 'tricksatthebar',\n",
+ " 'room',\n",
+ " 'hous',\n",
+ " 'adam',\n",
+ " 'paper',\n",
+ " 'went',\n",
+ " 'carri',\n",
+ " 'trespass',\n",
+ " 'nudg',\n",
+ " 'term',\n",
+ " 'cloudi',\n",
+ " 'outsid',\n",
+ " 'swizzlesqueak',\n",
+ " 'particularli',\n",
+ " 'male',\n",
+ " 'territori',\n",
+ " 'fierc',\n",
+ " 'protect',\n",
+ " 'anoth',\n",
+ " 'pretti',\n",
+ " 'jobless',\n",
+ " 'come',\n",
+ " 'throw',\n",
+ " 'desert',\n",
+ " 'accident',\n",
+ " 'ate',\n",
+ " 'gossip',\n",
+ " 'p',\n",
+ " 'everyon',\n",
+ " 'catchup',\n",
+ " 'weekend',\n",
+ " 'issu',\n",
+ " 'backlog',\n",
+ " 'manual',\n",
+ " 'hmmm',\n",
+ " 'grandson',\n",
+ " 'sent',\n",
+ " 'blackberri',\n",
+ " 'smartphon',\n",
+ " 'sprintspe',\n",
+ " 'kmlc',\n",
+ " 'charm',\n",
+ " 'ask',\n",
+ " 'nikon',\n",
+ " 'user',\n",
+ " 'l',\n",
+ " 'glass',\n",
+ " 'leav',\n",
+ " 'stomp',\n",
+ " 'huff',\n",
+ " 'dunno',\n",
+ " 'shot',\n",
+ " 'final',\n",
+ " 'yaayyy',\n",
+ " 'ashley',\n",
+ " 'tisdal',\n",
+ " 'germani',\n",
+ " 'comet',\n",
+ " 'hope',\n",
+ " 'sometim',\n",
+ " 'signal',\n",
+ " 'area',\n",
+ " 'felt',\n",
+ " 'lone',\n",
+ " 'moneymarv',\n",
+ " 'sup',\n",
+ " 'kuzzo',\n",
+ " 'yo',\n",
+ " 'tyme',\n",
+ " 'moro',\n",
+ " 'send',\n",
+ " 'wife',\n",
+ " 'airport',\n",
+ " 'ill',\n",
+ " 'found',\n",
+ " 'familiar',\n",
+ " 'sloooowwww',\n",
+ " 'fav',\n",
+ " 'snl',\n",
+ " 'wish',\n",
+ " 'hey',\n",
+ " 'deserv',\n",
+ " 'vacat',\n",
+ " 'nyc',\n",
+ " 'though',\n",
+ " 'congrat',\n",
+ " 'chri',\n",
+ " 'krisztian',\n",
+ " 'miszer',\n",
+ " 'beauti',\n",
+ " 'member',\n",
+ " 'kcsd',\n",
+ " 'must',\n",
+ " 'end',\n",
+ " 'yet',\n",
+ " 'summer',\n",
+ " 'bring',\n",
+ " 'mani',\n",
+ " 'cup',\n",
+ " 'hot',\n",
+ " 'tea',\n",
+ " 'honey',\n",
+ " 'could',\n",
+ " 'rid',\n",
+ " 'sore',\n",
+ " 'throat',\n",
+ " 'damn',\n",
+ " 'htc',\n",
+ " 'delay',\n",
+ " 'vodafon',\n",
+ " 'wait',\n",
+ " 'langer',\n",
+ " 'marleematlin',\n",
+ " 'genesimmon',\n",
+ " 'real',\n",
+ " 'alreadi',\n",
+ " 'followfriday',\n",
+ " 'itvdotcom',\n",
+ " 'rel',\n",
+ " 'emma',\n",
+ " 'ticket',\n",
+ " 'thursday',\n",
+ " 'wednesday',\n",
+ " 'play',\n",
+ " 'sim',\n",
+ " 'badli',\n",
+ " 'famili',\n",
+ " 'liam',\n",
+ " 'lime',\n",
+ " 'havent',\n",
+ " 'myspac',\n",
+ " 'half',\n",
+ " 'least',\n",
+ " 'tube',\n",
+ " 'finish',\n",
+ " 'chem',\n",
+ " 'part',\n",
+ " 'anyon',\n",
+ " 'idea',\n",
+ " 'sweetestcassi',\n",
+ " 'nothin',\n",
+ " 'werk',\n",
+ " 'nite',\n",
+ " 'cmaxcoop',\n",
+ " 'meant',\n",
+ " 'dri',\n",
+ " 'water',\n",
+ " 'giggl',\n",
+ " 'ever',\n",
+ " 'readi',\n",
+ " 'ladi',\n",
+ " 'mode',\n",
+ " 'shower',\n",
+ " 'wat',\n",
+ " 'happen',\n",
+ " 'meee',\n",
+ " 'krissi',\n",
+ " 'true',\n",
+ " 'somewher',\n",
+ " 'london',\n",
+ " 'manchest',\n",
+ " 'murder',\n",
+ " 'everi',\n",
+ " 'seem',\n",
+ " 'morrigan',\n",
+ " 'hug',\n",
+ " 'blfc',\n",
+ " 'yesterday',\n",
+ " 'sarmi',\n",
+ " 'drama',\n",
+ " 'calisu',\n",
+ " 'plant',\n",
+ " 'tree',\n",
+ " 'honor',\n",
+ " 'father',\n",
+ " 'bet',\n",
+ " 'busi',\n",
+ " 'jwilphoto',\n",
+ " 'keyshia',\n",
+ " 'truth',\n",
+ " 'speak',\n",
+ " 'concert',\n",
+ " 'theduncan',\n",
+ " 'quit',\n",
+ " 'confus',\n",
+ " 'control',\n",
+ " 'yay',\n",
+ " 'easier',\n",
+ " 'montanatuck',\n",
+ " 'hmm',\n",
+ " 'serious',\n",
+ " 'mmmm',\n",
+ " 'tiramisu',\n",
+ " 'beach',\n",
+ " 'haz',\n",
+ " 'colleg',\n",
+ " 'okay',\n",
+ " 'bird',\n",
+ " 'chirp',\n",
+ " 'attempt',\n",
+ " 'ish',\n",
+ " 'toodl',\n",
+ " 'juaner',\n",
+ " 'ahh',\n",
+ " 'juan',\n",
+ " 'dymilkshak',\n",
+ " 'soon',\n",
+ " 'tail',\n",
+ " 'write',\n",
+ " 'stupid',\n",
+ " 'pen',\n",
+ " 'took',\n",
+ " 'bobbytommi',\n",
+ " 'yeahh',\n",
+ " 'let',\n",
+ " 'msja',\n",
+ " 'hate',\n",
+ " 'group',\n",
+ " 'alyxxdion',\n",
+ " 'jenni',\n",
+ " 'jonasbroth',\n",
+ " 'liverpool',\n",
+ " 'co',\n",
+ " 'beatl',\n",
+ " 'live',\n",
+ " 'zuton',\n",
+ " 'dtown',\n",
+ " 'vanessacvaldez',\n",
+ " 'mexican',\n",
+ " 'blog',\n",
+ " 'never',\n",
+ " 'cuz',\n",
+ " 'lazi',\n",
+ " 'endlesswhimsi',\n",
+ " 'post',\n",
+ " 'boyfriend',\n",
+ " 'veri',\n",
+ " 'cycl',\n",
+ " 'buddi',\n",
+ " 'fell',\n",
+ " 'maddenlov',\n",
+ " 'understand',\n",
+ " 'exist',\n",
+ " 'aaahhhhhhhhhhhhhhhhhhhhhhhhhh',\n",
+ " 'nom',\n",
+ " 'kinda',\n",
+ " 'raini',\n",
+ " 'queesi',\n",
+ " 'fruster',\n",
+ " 'anger',\n",
+ " 'team',\n",
+ " 'staceyfreeadr',\n",
+ " 'stacey',\n",
+ " 'bobbi',\n",
+ " 'comment',\n",
+ " 'alan',\n",
+ " 'fwd',\n",
+ " 'name',\n",
+ " 'afternoon',\n",
+ " 'rather',\n",
+ " 'relax',\n",
+ " 'conserv',\n",
+ " 'cdwow',\n",
+ " 'simpl',\n",
+ " 'mind',\n",
+ " 'amazon',\n",
+ " 'canceld',\n",
+ " 'biomekkanik',\n",
+ " 'album',\n",
+ " 'vnv',\n",
+ " 'nation',\n",
+ " 'arriv',\n",
+ " 'tommcflyi',\n",
+ " 'freak',\n",
+ " 'repli',\n",
+ " 'joezor',\n",
+ " 'point',\n",
+ " 'win',\n",
+ " 'best',\n",
+ " 'excus',\n",
+ " 'swim',\n",
+ " 'mradamlambert',\n",
+ " 'normal',\n",
+ " 'would',\n",
+ " 'sinc',\n",
+ " 'isnt',\n",
+ " 'amon',\n",
+ " 'bore',\n",
+ " 'gypsyraven',\n",
+ " 'tub',\n",
+ " 'logi',\n",
+ " 'x',\n",
+ " 'liz',\n",
+ " 'tattoo',\n",
+ " 'cash',\n",
+ " 'nikkibenz',\n",
+ " 'jaylastarr',\n",
+ " 'nighti',\n",
+ " 'kind',\n",
+ " 'jennif',\n",
+ " 'either',\n",
+ " 'kentucki',\n",
+ " 'derbi',\n",
+ " 'mint',\n",
+ " 'julep',\n",
+ " 'situp',\n",
+ " 'km',\n",
+ " 'build',\n",
+ " 'gunz',\n",
+ " 'dreadkey',\n",
+ " 'k',\n",
+ " 'batter',\n",
+ " 'bruis',\n",
+ " 'arm',\n",
+ " 'tambourin',\n",
+ " 'self',\n",
+ " 'conflict',\n",
+ " 'qc',\n",
+ " 'great',\n",
+ " 'souleyede',\n",
+ " 'tue',\n",
+ " 'jame',\n",
+ " 'phelp',\n",
+ " 'hedgehog',\n",
+ " 'sega',\n",
+ " 'mega',\n",
+ " 'drive',\n",
+ " 'game',\n",
+ " 'almond',\n",
+ " 'nut',\n",
+ " 'furr',\n",
+ " 'victori',\n",
+ " 'ze',\n",
+ " 'dane',\n",
+ " 'bob',\n",
+ " 'dylan',\n",
+ " 'aint',\n",
+ " 'babe',\n",
+ " 'import',\n",
+ " 'iim',\n",
+ " 'late',\n",
+ " 'forward',\n",
+ " 'iphon',\n",
+ " 'softwar',\n",
+ " 'updat',\n",
+ " 'tallerguy',\n",
+ " 'sigh',\n",
+ " 'full',\n",
+ " 'proce',\n",
+ " 'lovin',\n",
+ " 'georg',\n",
+ " 'gina',\n",
+ " 'luci',\n",
+ " 'bag',\n",
+ " 'yazeez',\n",
+ " 'pink',\n",
+ " 'amyk',\n",
+ " 'geek',\n",
+ " 'label',\n",
+ " 'dymo',\n",
+ " 'labelwrit',\n",
+ " 'fall',\n",
+ " 'catagori',\n",
+ " 'appar',\n",
+ " 'moekelsak',\n",
+ " 'via',\n",
+ " 'owl',\n",
+ " 'save',\n",
+ " 'reali',\n",
+ " 'togeth',\n",
+ " 'farewel',\n",
+ " 'hame',\n",
+ " 'mtstanford',\n",
+ " 'kno',\n",
+ " 'luck',\n",
+ " 'goodev',\n",
+ " 'peopl',\n",
+ " 'dish',\n",
+ " 'si',\n",
+ " 'came',\n",
+ " 'yell',\n",
+ " 'camera',\n",
+ " 'bit',\n",
+ " 'blitz',\n",
+ " 'parent',\n",
+ " 'dundundun',\n",
+ " 'uni',\n",
+ " 'revis',\n",
+ " 'sunshine',\n",
+ " 'interview',\n",
+ " 'epicwat',\n",
+ " 'add',\n",
+ " 'heh',\n",
+ " 'soooo',\n",
+ " 'sleepi',\n",
+ " 'ben',\n",
+ " 'tommcfli',\n",
+ " 'pleaaaaas',\n",
+ " 'steffi',\n",
+ " 'ah',\n",
+ " 'almost',\n",
+ " 'ima',\n",
+ " 'lt',\n",
+ " 'chip',\n",
+ " 'mad',\n",
+ " 'season',\n",
+ " 'far',\n",
+ " 'flawlessli',\n",
+ " 'pleasebitem',\n",
+ " 'total',\n",
+ " 'fair',\n",
+ " 'daughter',\n",
+ " 'bday',\n",
+ " 'parti',\n",
+ " 'theme',\n",
+ " 'hello',\n",
+ " 'kitti',\n",
+ " 'cake',\n",
+ " 'pattygal',\n",
+ " 'patti',\n",
+ " 'agre',\n",
+ " 'itsashlz',\n",
+ " 'mmmmmwwwwaaaaaarrrrr',\n",
+ " 'therobotard',\n",
+ " 'tsnydermtg',\n",
+ " 'jump',\n",
+ " 'said',\n",
+ " 'layer',\n",
+ " 'onion',\n",
+ " 'approach',\n",
+ " 'met',\n",
+ " 'ef',\n",
+ " 'rain',\n",
+ " 'wear',\n",
+ " 'blue',\n",
+ " 'poncho',\n",
+ " 'sctape',\n",
+ " 'youngq',\n",
+ " 'gr',\n",
+ " 'rob',\n",
+ " 'mixingtp',\n",
+ " 'nitenit',\n",
+ " 'st',\n",
+ " 'month',\n",
+ " 'woo',\n",
+ " 'hoo',\n",
+ " 'june',\n",
+ " 'punish',\n",
+ " 'claireolivar',\n",
+ " 'moon',\n",
+ " 'butt',\n",
+ " 'grrrrrrrrr',\n",
+ " 'sooooooooooo',\n",
+ " 'darn',\n",
+ " 'slow',\n",
+ " 'stephanieellen',\n",
+ " 'thanx',\n",
+ " 'headach',\n",
+ " 'advic',\n",
+ " 'eye',\n",
+ " 'longest',\n",
+ " 'lucr',\n",
+ " 'career',\n",
+ " 'gomeztheband',\n",
+ " 'sort',\n",
+ " 'kept',\n",
+ " 'toe',\n",
+ " 'ramesrandrew',\n",
+ " 'interest',\n",
+ " 'hear',\n",
+ " 'handsfre',\n",
+ " 'bork',\n",
+ " 'silent',\n",
+ " 'hide',\n",
+ " 'rock',\n",
+ " 'nadtriadina',\n",
+ " 'glenanderson',\n",
+ " 'wors',\n",
+ " 'booooo',\n",
+ " 'earthshinedesig',\n",
+ " 'arduino',\n",
+ " 'broke',\n",
+ " 'atmega',\n",
+ " 'pack',\n",
+ " 'muahahahah',\n",
+ " 'hangov',\n",
+ " 'mcgiff',\n",
+ " 'join',\n",
+ " 'smile',\n",
+ " 'lunchtim',\n",
+ " 'weight',\n",
+ " 'except',\n",
+ " 'cheat',\n",
+ " 'margiethiel',\n",
+ " 'advantag',\n",
+ " 'beerealti',\n",
+ " 'dawniecahil',\n",
+ " 'awww',\n",
+ " 'hillari',\n",
+ " 'idk',\n",
+ " 'tatt',\n",
+ " 'wee',\n",
+ " 'secur',\n",
+ " 'jennykiwi',\n",
+ " 'local',\n",
+ " 'pet',\n",
+ " 'store',\n",
+ " 'sell',\n",
+ " 'cock',\n",
+ " 'flight',\n",
+ " 'slum',\n",
+ " 'departur',\n",
+ " 'loung',\n",
+ " 'zckenni',\n",
+ " 'consult',\n",
+ " 'recommend',\n",
+ " 'someon',\n",
+ " 'stila',\n",
+ " 'unverifi',\n",
+ " 'sourc',\n",
+ " 'ltte',\n",
+ " 'colombo',\n",
+ " 'investig',\n",
+ " 'bodi',\n",
+ " 'excit',\n",
+ " 'carm',\n",
+ " 'lngoeyegrl',\n",
+ " 'glad',\n",
+ " 'celebr',\n",
+ " 'nicolejp',\n",
+ " 'sumrheat',\n",
+ " 'decid',\n",
+ " 'meet',\n",
+ " 'backstag',\n",
+ " 'gether',\n",
+ " 'cal',\n",
+ " 'nalgen',\n",
+ " 'bottl',\n",
+ " 'dehydr',\n",
+ " 'shrivel',\n",
+ " 'raisin',\n",
+ " 'chemistri',\n",
+ " 'baaahh',\n",
+ " 'mileymonday',\n",
+ " 'retweet',\n",
+ " 'onload',\n",
+ " 'alert',\n",
+ " 'gt',\n",
+ " 'fail',\n",
+ " 'shadowsierra',\n",
+ " 'cage',\n",
+ " 'anywher',\n",
+ " 'boy',\n",
+ " 'lame',\n",
+ " 'men',\n",
+ " 'buttt',\n",
+ " 'lmfao',\n",
+ " 'tho',\n",
+ " 'poor',\n",
+ " 'sarah',\n",
+ " 'allieandra',\n",
+ " 'harriettaa',\n",
+ " 'cur',\n",
+ " 'imperi',\n",
+ " 'non',\n",
+ " 'metric',\n",
+ " 'system',\n",
+ " 'confound',\n",
+ " 'stay',\n",
+ " 'rove',\n",
+ " 'chariti',\n",
+ " 'support',\n",
+ " 'logoguppi',\n",
+ " 'john',\n",
+ " 'similar',\n",
+ " 'geni',\n",
+ " 'aladdin',\n",
+ " 'americanapparel',\n",
+ " 'surviv',\n",
+ " 'quak',\n",
+ " 'option',\n",
+ " 'deep',\n",
+ " 'crew',\n",
+ " 'neck',\n",
+ " 'sister',\n",
+ " 'collar',\n",
+ " 'bone',\n",
+ " 'broken',\n",
+ " 'lthagreat',\n",
+ " 'twonswaggcheck',\n",
+ " 'ohhhh',\n",
+ " 'digg',\n",
+ " 'sameerpatel',\n",
+ " 'product',\n",
+ " 'box',\n",
+ " 'hilari',\n",
+ " 'kate',\n",
+ " 'expect',\n",
+ " 'faster',\n",
+ " 'pacquiao',\n",
+ " 'fight',\n",
+ " 'monkaaay',\n",
+ " 'lil',\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "# your code here\n",
+ "count_freq= nltk.FreqDist()\n",
+ "\n",
+ "for tweet in sample[\"text_processed\"]:\n",
+ " for word in tweet:\n",
+ " count_freq[word]+=1\n",
+ "\n",
+ "top_5k_words= list(count_freq.keys())[:5000]\n",
+ "top_5k_words"
]
},
{
@@ -167,11 +1520,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20000\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
+ "\n",
+ "# your code here\n",
+ "def find_features(document):\n",
+ " words = set(document)\n",
+ " features = {}\n",
+ " for w in top_5k_words:\n",
+ " features[w] = (w in words)\n",
+ " return features\n",
+ "\n",
+ "features_set= [(find_features(tweet), target) for (tweet, target) in list(zip(sample['text_processed'], sample['target']))]\n",
+ "print(len(features_set))"
]
},
{
@@ -210,11 +1581,24 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "# your code here\n",
+ "\n",
+ "from textblob.classifiers import NaiveBayesClassifier\n",
+ "train_set,test_set = features_set[:10000],features_set[10000:]\n",
+ "classifier = nltk.NaiveBayesClassifier.train(train_set)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier.show_most_informative_features()"
]
},
{
@@ -298,7 +1682,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
@@ -312,7 +1696,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.9.12"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
+ }
}
},
"nbformat": 4,