diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..e292086 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,9 +66,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
+ "source": [
+ "import re\n",
+ "import nltk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ironhack s q website is'"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
@@ -79,7 +100,14 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " string = re.sub(r'http\\S+', '', s)\n",
+ " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+ " \n",
+ "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n",
+ "\n",
+ "test_string = clean_up(test)\n",
+ "test_string"
]
},
{
@@ -101,9 +129,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'website', 'is']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def tokenize(s):\n",
" \"\"\"\n",
@@ -114,7 +153,11 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " return nltk.word_tokenize(s)\n",
+ "\n",
+ "test_string = tokenize(test_string)\n",
+ "test_string"
]
},
{
@@ -145,11 +188,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['ironhack', 's', 'q', 'websit', 'is']\n"
+ ]
+ }
+ ],
"source": [
"def stem_and_lemmatize(l):\n",
+ " \n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
"\n",
@@ -158,7 +210,17 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " ps = nltk.PorterStemmer()\n",
+ " lemmatizer = nltk.WordNetLemmatizer()\n",
+ " l2 = []\n",
+ " \n",
+ " for w in l:\n",
+ " s = ps.stem(w)\n",
+ " s = lemmatizer.lemmatize(s)\n",
+ " l2 += [s]\n",
+ " \n",
+ " return l2\n"
]
},
{
@@ -176,10 +238,19 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 24,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ironhack q website\n"
+ ]
+ }
+ ],
"source": [
+ "from nltk.corpus import stopwords\n",
"def remove_stopwords(l):\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
@@ -189,7 +260,12 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " stop_words = stopwords.words('english')\n",
+ "\n",
+ " return ' '.join([w for w in l if w not in stop_words])\n",
+ "\n",
+ "print(remove_stopwords(test_string))"
]
},
{
@@ -204,7 +280,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -218,7 +294,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.10.4"
}
},
"nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..6c11cb8 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -37,20 +37,109 @@
"\n",
"### Loading and Exploring Data\n",
"\n",
- "The dataset we'll be using today is located on Kaggle (https://www.kaggle.com/kazanova/sentiment140). Once you have downloaded and imported the dataset, it you will need to define the columns names: df.columns = ['target','id','date','flag','user','text']\n",
+ "The dataset we'll be using today is located in the lab directory named `Sentiment140.csv.zip`. You need to unzip it into a `.csv` file. Then in the cell below, load and explore the data.\n",
"\n",
"*Notes:* \n",
"\n",
+ "* The dataset was downloaded from [Kaggle](https://www.kaggle.com/kazanova/sentiment140). We made a slight change on the original data so that each column has a label.\n",
+ "\n",
"* The dataset is huuuuge (1.6m tweets). When you develop your data analysis codes, you can sample a subset of the data (e.g. 20k records) so that you will save a lot of time when you test your codes."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from nltk.corpus import stopwords\n",
+ "import re\n",
+ "import nltk\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from nltk.probability import ConditionalFreqDist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_up(s):\n",
+ " \"\"\"\n",
+ " Cleans up numbers, URLs, and special characters from a string.\n",
+ "\n",
+ " Args:\n",
+ " s: The string to be cleaned up.\n",
+ "\n",
+ " Returns:\n",
+ " A string that has been cleaned up.\n",
+ " \"\"\"\n",
+ " string = re.sub(r'http\\S+', '', s)\n",
+ " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+ "\n",
+ "def tokenize(s):\n",
+ " \"\"\"\n",
+ " Tokenize a string.\n",
+ "\n",
+ " Args:\n",
+ " s: String to be tokenized.\n",
+ "\n",
+ " Returns:\n",
+ " A list of words as the result of tokenization.\n",
+ " \"\"\"\n",
+ " return nltk.word_tokenize(s)\n",
+ "\n",
+ "def stem_and_lemmatize(l):\n",
+ " \n",
+ " \"\"\"\n",
+ " Perform stemming and lemmatization on a list of words.\n",
+ "\n",
+ " Args:\n",
+ " l: A list of strings.\n",
+ "\n",
+ " Returns:\n",
+ " A list of strings after being stemmed and lemmatized.\n",
+ " \"\"\"\n",
+ " ps = nltk.PorterStemmer()\n",
+ " lemmatizer = nltk.WordNetLemmatizer()\n",
+ " l2 = []\n",
+ " \n",
+ " for w in l:\n",
+ " s = ps.stem(w)\n",
+ " s = lemmatizer.lemmatize(s)\n",
+ " l2 += [s]\n",
+ " \n",
+ " return l2\n",
+ "\n",
+ "\n",
+ "def remove_stopwords(l):\n",
+ " \"\"\"\n",
+ " Remove English stopwords from a list of strings.\n",
+ "\n",
+ " Args:\n",
+ " l: A list of strings.\n",
+ "\n",
+ " Returns:\n",
+ " A list of strings after stop words are removed.\n",
+ " \"\"\"\n",
+ " stop_words = stopwords.words('english')\n",
+ "\n",
+ " return [w for w in l if w not in stop_words]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweets = pd.read_csv('C:/Users/Zaca/Documents/Datasets/sentiment140.csv')\n",
+ "sample = tweets.sample(20000)\n",
+ "sample['target'] = sample['target'].replace(4, 1)"
]
},
{
@@ -76,11 +165,206 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 471686 | \n",
+ " 0 | \n",
+ " 2176677449 | \n",
+ " Mon Jun 15 04:29:23 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " JessicaShiree | \n",
+ " Last day of classes with my Day 1 kids | \n",
+ " [last, day, class, day, kid] | \n",
+ "
\n",
+ " \n",
+ " | 1568754 | \n",
+ " 1 | \n",
+ " 2188268260 | \n",
+ " Mon Jun 15 21:35:26 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " lcmelody | \n",
+ " Rant over. Now it's time for me to actually ge... | \n",
+ " [rant, time, actual, get] | \n",
+ "
\n",
+ " \n",
+ " | 1514170 | \n",
+ " 1 | \n",
+ " 2175446548 | \n",
+ " Mon Jun 15 01:00:40 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " tamzinaki | \n",
+ " @Tsaksonakis love that song.don't remember it ... | \n",
+ " [tsaksonaki, love, song, rememb, bit, f, amp, ... | \n",
+ "
\n",
+ " \n",
+ " | 1013269 | \n",
+ " 1 | \n",
+ " 1881296235 | \n",
+ " Fri May 22 04:14:15 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Eghie_Dy | \n",
+ " I'm so happy | \n",
+ " [happi] | \n",
+ "
\n",
+ " \n",
+ " | 443613 | \n",
+ " 0 | \n",
+ " 2067603654 | \n",
+ " Sun Jun 07 12:38:14 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " icysun23 | \n",
+ " @decorus I DONT KNOW!!! | \n",
+ " [decoru, dont, know] | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1110391 | \n",
+ " 1 | \n",
+ " 1972049147 | \n",
+ " Sat May 30 08:27:15 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " iCasandy | \n",
+ " @SoulGlowActivtr dat song is da bomb!!! | \n",
+ " [soulglowactivtr, dat, song, da, bomb] | \n",
+ "
\n",
+ " \n",
+ " | 649306 | \n",
+ " 0 | \n",
+ " 2237251942 | \n",
+ " Fri Jun 19 05:28:19 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " kakaxo | \n",
+ " I go to the hairdresser and then to friends ... | \n",
+ " [go, hairdress, friend, night, lt, ugli, weath... | \n",
+ "
\n",
+ " \n",
+ " | 894195 | \n",
+ " 1 | \n",
+ " 1692177930 | \n",
+ " Sun May 03 19:26:28 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ewindsor | \n",
+ " @aLINEofCOCJIN Ah awesome. Good to hear from ... | \n",
+ " [alineofcocjin, ah, awesom, good, hear] | \n",
+ "
\n",
+ " \n",
+ " | 57698 | \n",
+ " 0 | \n",
+ " 1685723636 | \n",
+ " Sun May 03 01:45:08 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " acaigirl | \n",
+ " @Avie89 sorry to hear that... | \n",
+ " [avi, sorri, hear] | \n",
+ "
\n",
+ " \n",
+ " | 1314188 | \n",
+ " 1 | \n",
+ " 2013877292 | \n",
+ " Tue Jun 02 23:49:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Jellie1981 | \n",
+ " waiting for the train! http://yfrog.com/eha8sj | \n",
+ " [wait, train] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20000 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "471686 0 2176677449 Mon Jun 15 04:29:23 PDT 2009 NO_QUERY \n",
+ "1568754 1 2188268260 Mon Jun 15 21:35:26 PDT 2009 NO_QUERY \n",
+ "1514170 1 2175446548 Mon Jun 15 01:00:40 PDT 2009 NO_QUERY \n",
+ "1013269 1 1881296235 Fri May 22 04:14:15 PDT 2009 NO_QUERY \n",
+ "443613 0 2067603654 Sun Jun 07 12:38:14 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... \n",
+ "1110391 1 1972049147 Sat May 30 08:27:15 PDT 2009 NO_QUERY \n",
+ "649306 0 2237251942 Fri Jun 19 05:28:19 PDT 2009 NO_QUERY \n",
+ "894195 1 1692177930 Sun May 03 19:26:28 PDT 2009 NO_QUERY \n",
+ "57698 0 1685723636 Sun May 03 01:45:08 PDT 2009 NO_QUERY \n",
+ "1314188 1 2013877292 Tue Jun 02 23:49:36 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "471686 JessicaShiree Last day of classes with my Day 1 kids \n",
+ "1568754 lcmelody Rant over. Now it's time for me to actually ge... \n",
+ "1514170 tamzinaki @Tsaksonakis love that song.don't remember it ... \n",
+ "1013269 Eghie_Dy I'm so happy \n",
+ "443613 icysun23 @decorus I DONT KNOW!!! \n",
+ "... ... ... \n",
+ "1110391 iCasandy @SoulGlowActivtr dat song is da bomb!!! \n",
+ "649306 kakaxo I go to the hairdresser and then to friends ... \n",
+ "894195 ewindsor @aLINEofCOCJIN Ah awesome. Good to hear from ... \n",
+ "57698 acaigirl @Avie89 sorry to hear that... \n",
+ "1314188 Jellie1981 waiting for the train! http://yfrog.com/eha8sj \n",
+ "\n",
+ " text_processed \n",
+ "471686 [last, day, class, day, kid] \n",
+ "1568754 [rant, time, actual, get] \n",
+ "1514170 [tsaksonaki, love, song, rememb, bit, f, amp, ... \n",
+ "1013269 [happi] \n",
+ "443613 [decoru, dont, know] \n",
+ "... ... \n",
+ "1110391 [soulglowactivtr, dat, song, da, bomb] \n",
+ "649306 [go, hairdress, friend, night, lt, ugli, weath... \n",
+ "894195 [alineofcocjin, ah, awesom, good, hear] \n",
+ "57698 [avi, sorri, hear] \n",
+ "1314188 [wait, train] \n",
+ "\n",
+ "[20000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "sample['text_processed'] = sample['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)\n",
+ "sample"
]
},
{
@@ -98,11 +382,1029 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['last',\n",
+ " 'day',\n",
+ " 'class',\n",
+ " 'kid',\n",
+ " 'rant',\n",
+ " 'time',\n",
+ " 'actual',\n",
+ " 'get',\n",
+ " 'tsaksonaki',\n",
+ " 'love',\n",
+ " 'song',\n",
+ " 'rememb',\n",
+ " 'bit',\n",
+ " 'f',\n",
+ " 'amp',\n",
+ " 'l',\n",
+ " 'saw',\n",
+ " 'actor',\n",
+ " 'studio',\n",
+ " 'happi',\n",
+ " 'decoru',\n",
+ " 'dont',\n",
+ " 'know',\n",
+ " 'rebeccamezzino',\n",
+ " 'hello',\n",
+ " 'bec',\n",
+ " 'long',\n",
+ " 'melbourn',\n",
+ " 'abl',\n",
+ " 'make',\n",
+ " 'tweetup',\n",
+ " 'ugh',\n",
+ " 'im',\n",
+ " 'tire',\n",
+ " 'hardli',\n",
+ " 'anyth',\n",
+ " 'play',\n",
+ " 'niec',\n",
+ " 'goin',\n",
+ " 'bed',\n",
+ " 'goodnight',\n",
+ " 'eveyon',\n",
+ " 'lt',\n",
+ " 'ha',\n",
+ " 'reject',\n",
+ " 'back',\n",
+ " 'amaz',\n",
+ " 'two',\n",
+ " 'never',\n",
+ " 'forget',\n",
+ " 'feel',\n",
+ " 'break',\n",
+ " 'pool',\n",
+ " 'swim',\n",
+ " 'middl',\n",
+ " 'lightn',\n",
+ " 'storm',\n",
+ " 'sunni',\n",
+ " 'xtineismyhero',\n",
+ " 'happen',\n",
+ " 'darl',\n",
+ " 'laffit',\n",
+ " 'still',\n",
+ " 'salon',\n",
+ " 'send',\n",
+ " 'pic',\n",
+ " 'eaten',\n",
+ " 'aliv',\n",
+ " 'mosquito',\n",
+ " 'post',\n",
+ " 'waaaaay',\n",
+ " 'much',\n",
+ " 'forgot',\n",
+ " 'wa',\n",
+ " 'gon',\n",
+ " 'na',\n",
+ " 'say',\n",
+ " 'mom',\n",
+ " 'invit',\n",
+ " 'parti',\n",
+ " 'go',\n",
+ " 'though',\n",
+ " 'veilin',\n",
+ " 'miss',\n",
+ " 'see',\n",
+ " 'like',\n",
+ " 'month',\n",
+ " 'alreadi',\n",
+ " 'think',\n",
+ " 'look',\n",
+ " 'xd',\n",
+ " 'chemistri',\n",
+ " 'revison',\n",
+ " 'well',\n",
+ " 'bore',\n",
+ " 'hate',\n",
+ " 'peopl',\n",
+ " 'finish',\n",
+ " 'exam',\n",
+ " 'festivalfan',\n",
+ " 'u',\n",
+ " 'start',\n",
+ " 'eat',\n",
+ " 'carb',\n",
+ " 'ur',\n",
+ " 'bodi',\n",
+ " 'wont',\n",
+ " 'weight',\n",
+ " 'come',\n",
+ " 'fast',\n",
+ " 'ive',\n",
+ " 'clean',\n",
+ " 'deserv',\n",
+ " 'recognit',\n",
+ " 'mother',\n",
+ " 'caus',\n",
+ " 'problem',\n",
+ " 'instead',\n",
+ " 'montyrul',\n",
+ " 'pearllow',\n",
+ " 'andi',\n",
+ " 'wish',\n",
+ " 'luck',\n",
+ " 'gig',\n",
+ " 'glad',\n",
+ " 'went',\n",
+ " 'glasto',\n",
+ " 'xxxxxx',\n",
+ " 'got',\n",
+ " 'ta',\n",
+ " 'pack',\n",
+ " 'trip',\n",
+ " 'daddi',\n",
+ " 'granni',\n",
+ " 'poo',\n",
+ " 'wait',\n",
+ " 'midnight',\n",
+ " 'jona',\n",
+ " 'brother',\n",
+ " 'new',\n",
+ " 'album',\n",
+ " 'tomorrow',\n",
+ " 'dead',\n",
+ " 'zoran',\n",
+ " 'lost',\n",
+ " 'croatian',\n",
+ " 'idol',\n",
+ " 'differ',\n",
+ " 'le',\n",
+ " 'vote',\n",
+ " 'prepar',\n",
+ " 'univers',\n",
+ " 'great',\n",
+ " 'hook',\n",
+ " 'learn',\n",
+ " 'read',\n",
+ " 'yesterday',\n",
+ " 'hope',\n",
+ " 'sloan',\n",
+ " 'download',\n",
+ " 'trvsdjam',\n",
+ " 'mixtap',\n",
+ " 'unzip',\n",
+ " 'invalid',\n",
+ " 'corrupt',\n",
+ " 'krishnakum',\n",
+ " 'told',\n",
+ " 'ya',\n",
+ " 'uber',\n",
+ " 'cool',\n",
+ " 'treat',\n",
+ " 'omen',\n",
+ " 'yoo',\n",
+ " 'sittin',\n",
+ " 'car',\n",
+ " 'crazi',\n",
+ " 'ish',\n",
+ " 'rite',\n",
+ " 'man',\n",
+ " 'ima',\n",
+ " 'nitemar',\n",
+ " 'tonit',\n",
+ " 'bryci',\n",
+ " 'seen',\n",
+ " 'ure',\n",
+ " 'stpatrick',\n",
+ " 'show',\n",
+ " 'amazin',\n",
+ " 'person',\n",
+ " 'take',\n",
+ " 'earlier',\n",
+ " 'comment',\n",
+ " 'b',\n",
+ " 'superstar',\n",
+ " 'synwpn',\n",
+ " 'whi',\n",
+ " 'wan',\n",
+ " 'stalk',\n",
+ " 'anyway',\n",
+ " 'chelseamoss',\n",
+ " 'everyth',\n",
+ " 'summer',\n",
+ " 'far',\n",
+ " 'good',\n",
+ " 'ashleigharsen',\n",
+ " 'also',\n",
+ " 'lemonhead',\n",
+ " 'delici',\n",
+ " 'plea',\n",
+ " 'compar',\n",
+ " 'one',\n",
+ " 'latinegro',\n",
+ " 'unfortun',\n",
+ " 'red',\n",
+ " 'robin',\n",
+ " 'famili',\n",
+ " 'friend',\n",
+ " 'perfect',\n",
+ " 'church',\n",
+ " 'lunch',\n",
+ " 'lora',\n",
+ " 'text',\n",
+ " 'around',\n",
+ " 'watch',\n",
+ " 'oc',\n",
+ " 'ye',\n",
+ " 'knockin',\n",
+ " 'night',\n",
+ " 'pointforwardpro',\n",
+ " 'omg',\n",
+ " 'thnk',\n",
+ " 'remind',\n",
+ " 'colleg',\n",
+ " 'right',\n",
+ " 'everytim',\n",
+ " 'spend',\n",
+ " 'front',\n",
+ " 'comput',\n",
+ " 'hurt',\n",
+ " 'home',\n",
+ " 'put',\n",
+ " 'bedtim',\n",
+ " 'becaus',\n",
+ " 'want',\n",
+ " 'sleep',\n",
+ " 'alon',\n",
+ " 'feliz',\n",
+ " 'de',\n",
+ " 'la',\n",
+ " 'madr',\n",
+ " 'godmommi',\n",
+ " 'sign',\n",
+ " 'contract',\n",
+ " 'apart',\n",
+ " 'need',\n",
+ " 'email',\n",
+ " 'contact',\n",
+ " 'check',\n",
+ " 'emailunlimit',\n",
+ " 'final',\n",
+ " 'broke',\n",
+ " 'hi',\n",
+ " 'casino',\n",
+ " 'habit',\n",
+ " 'lose',\n",
+ " 'next',\n",
+ " 'week',\n",
+ " 'afraid',\n",
+ " 'nightmar',\n",
+ " 'chees',\n",
+ " 'befor',\n",
+ " 'effect',\n",
+ " 'ravioli',\n",
+ " 'grill',\n",
+ " 'twitter',\n",
+ " 'product',\n",
+ " 'velvet',\n",
+ " 'cake',\n",
+ " 'speak',\n",
+ " 'sofiedevil',\n",
+ " 'left',\n",
+ " 'phone',\n",
+ " 'lie',\n",
+ " 'cough',\n",
+ " 'wonder',\n",
+ " 'anyon',\n",
+ " 'die',\n",
+ " 'seedi',\n",
+ " 'motel',\n",
+ " 'zombi',\n",
+ " 'shark',\n",
+ " 'tuesday',\n",
+ " 'woke',\n",
+ " 'pm',\n",
+ " 'fall',\n",
+ " 'asleep',\n",
+ " 'suck',\n",
+ " 'hour',\n",
+ " 'ie',\n",
+ " 'greek',\n",
+ " 'easter',\n",
+ " 'asian',\n",
+ " 'twist',\n",
+ " 'ashalale',\n",
+ " 'soo',\n",
+ " 'creativ',\n",
+ " 'gt',\n",
+ " 'thi',\n",
+ " 'must',\n",
+ " 'find',\n",
+ " 'someth',\n",
+ " 'fun',\n",
+ " 'blkpanther',\n",
+ " 'way',\n",
+ " 'seem',\n",
+ " 'somali',\n",
+ " 'civil',\n",
+ " 'war',\n",
+ " 'roto',\n",
+ " 'sadli',\n",
+ " 'espn',\n",
+ " 'gener',\n",
+ " 'medium',\n",
+ " 'kind',\n",
+ " 'thing',\n",
+ " 'amournoir',\n",
+ " 'work',\n",
+ " 'casualcottag',\n",
+ " 'big',\n",
+ " 'smiley',\n",
+ " 'face',\n",
+ " 'mybigg',\n",
+ " 'lolz',\n",
+ " 'wnba',\n",
+ " 'wrong',\n",
+ " 'live',\n",
+ " 'access',\n",
+ " 'graphic',\n",
+ " 'blog',\n",
+ " 'oprah',\n",
+ " 'thank',\n",
+ " 'share',\n",
+ " 'alexrk',\n",
+ " 'haha',\n",
+ " 'true',\n",
+ " 'alex',\n",
+ " 'ahhh',\n",
+ " 'anoth',\n",
+ " 'soon',\n",
+ " 'hmmm',\n",
+ " 'white',\n",
+ " 'tri',\n",
+ " 'chase',\n",
+ " 'couch',\n",
+ " 'yell',\n",
+ " 'rm',\n",
+ " 'poorer',\n",
+ " 'accident',\n",
+ " 'gave',\n",
+ " 'discount',\n",
+ " 'magazin',\n",
+ " 'sob',\n",
+ " 'twit',\n",
+ " 'juli',\n",
+ " 'nope',\n",
+ " 'yet',\n",
+ " 'scienc',\n",
+ " 'summ',\n",
+ " 'today',\n",
+ " 'jame',\n",
+ " 'buckley',\n",
+ " 'would',\n",
+ " 'end',\n",
+ " 'till',\n",
+ " 'weekend',\n",
+ " 'reschedul',\n",
+ " 'shower',\n",
+ " 'water',\n",
+ " 'frozen',\n",
+ " 'readi',\n",
+ " 'alway',\n",
+ " 'room',\n",
+ " 'veri',\n",
+ " 'hot',\n",
+ " 'sazp',\n",
+ " 'lush',\n",
+ " 'realiz',\n",
+ " 'tune',\n",
+ " 'thesixtyon',\n",
+ " 'com',\n",
+ " 'coupl',\n",
+ " 'nice',\n",
+ " 'daniboo',\n",
+ " 'hous',\n",
+ " 'woah',\n",
+ " 'rock',\n",
+ " 'life',\n",
+ " 'jesu',\n",
+ " 'takin',\n",
+ " 'step',\n",
+ " 'stuff',\n",
+ " 'food',\n",
+ " 'montanaon',\n",
+ " 'could',\n",
+ " 'flight',\n",
+ " 'agentpatgillen',\n",
+ " 'sure',\n",
+ " 'feelin',\n",
+ " 'trulli',\n",
+ " 'stune',\n",
+ " 'mr',\n",
+ " 'david',\n",
+ " 'carradin',\n",
+ " 'kungfu',\n",
+ " 'movi',\n",
+ " 'lushi',\n",
+ " 'dread',\n",
+ " 'deni',\n",
+ " 'danimarzillo',\n",
+ " 'ouch',\n",
+ " 'slice',\n",
+ " 'top',\n",
+ " 'finger',\n",
+ " 'gloriou',\n",
+ " 'sunshin',\n",
+ " 'bake',\n",
+ " 'browni',\n",
+ " 'sun',\n",
+ " 'yay',\n",
+ " 'boyl',\n",
+ " 'didnt',\n",
+ " 'win',\n",
+ " 'listen',\n",
+ " 'britney',\n",
+ " 'loveeess',\n",
+ " 'xoxo',\n",
+ " 'crush',\n",
+ " 'hardcor',\n",
+ " 'busi',\n",
+ " 'sore',\n",
+ " 'realli',\n",
+ " 'school',\n",
+ " 'ughh',\n",
+ " 'quot',\n",
+ " 'bbq',\n",
+ " 'outsid',\n",
+ " 'sweat',\n",
+ " 'smell',\n",
+ " 'yummi',\n",
+ " 'follwr',\n",
+ " 'drop',\n",
+ " 'hard',\n",
+ " 'tell',\n",
+ " 'spambot',\n",
+ " 'record',\n",
+ " 'stock',\n",
+ " 'nake',\n",
+ " 'ladi',\n",
+ " 'opportun',\n",
+ " 'dianhadinoto',\n",
+ " 'sweet',\n",
+ " 'sharlynnx',\n",
+ " 'aww',\n",
+ " 'naah',\n",
+ " 'favourit',\n",
+ " 'nighti',\n",
+ " 'cours',\n",
+ " 'delet',\n",
+ " 'dawson',\n",
+ " 'creek',\n",
+ " 'ol',\n",
+ " 'ruben',\n",
+ " 'spanish',\n",
+ " 'gp',\n",
+ " 'disappoint',\n",
+ " 'soft',\n",
+ " 'spot',\n",
+ " 'jcookonlin',\n",
+ " 'yeahhh',\n",
+ " 'mean',\n",
+ " 'someon',\n",
+ " 'fairli',\n",
+ " 'close',\n",
+ " 'knew',\n",
+ " 'sad',\n",
+ " 'camera',\n",
+ " 'nsenze',\n",
+ " 'onli',\n",
+ " 'wors',\n",
+ " 'suppos',\n",
+ " 'repli',\n",
+ " 'lalaitsmaria',\n",
+ " 'oh',\n",
+ " 'carliecarrcrash',\n",
+ " 'panaera',\n",
+ " 'four',\n",
+ " 'tonight',\n",
+ " 'best',\n",
+ " 'orlando',\n",
+ " 'girli',\n",
+ " 'backyard',\n",
+ " 'later',\n",
+ " 'afternoon',\n",
+ " 'real',\n",
+ " 'forev',\n",
+ " 'innoc',\n",
+ " 'word',\n",
+ " 'yr',\n",
+ " 'old',\n",
+ " 'hold',\n",
+ " 'even',\n",
+ " 'millionair',\n",
+ " 'driver',\n",
+ " 'spin',\n",
+ " 'crash',\n",
+ " 'super',\n",
+ " 'race',\n",
+ " 'machin',\n",
+ " 'raini',\n",
+ " 'shanghai',\n",
+ " 'simpli',\n",
+ " 'made',\n",
+ " 'fail',\n",
+ " 'darlingnickieb',\n",
+ " 'daaannnnggg',\n",
+ " 'porki',\n",
+ " 'xo',\n",
+ " 'handli',\n",
+ " 'mayb',\n",
+ " 'charlii',\n",
+ " 'yeah',\n",
+ " 'felt',\n",
+ " 'bad',\n",
+ " 'um',\n",
+ " 'ate',\n",
+ " 'rainbow',\n",
+ " 'paddl',\n",
+ " 'pop',\n",
+ " 'xx',\n",
+ " 'sethu',\n",
+ " 'j',\n",
+ " 'thought',\n",
+ " 'exactli',\n",
+ " 'fuck',\n",
+ " 'er',\n",
+ " 'girl',\n",
+ " 'enjoy',\n",
+ " 'wknd',\n",
+ " 'might',\n",
+ " 'landd',\n",
+ " 'foca',\n",
+ " 'nesslle',\n",
+ " 'x',\n",
+ " 'men',\n",
+ " 'fan',\n",
+ " 'paola',\n",
+ " 'total',\n",
+ " 'hugh',\n",
+ " 'gif',\n",
+ " 'cute',\n",
+ " 'norm',\n",
+ " 'cant',\n",
+ " 'tom',\n",
+ " 'isnt',\n",
+ " 'n',\n",
+ " 'fave',\n",
+ " 'guess',\n",
+ " 'hahahha',\n",
+ " 'pretti',\n",
+ " 'tea',\n",
+ " 'thigh',\n",
+ " 'interfac',\n",
+ " 'nrwi',\n",
+ " 'throw',\n",
+ " 'heavi',\n",
+ " 'object',\n",
+ " 'snore',\n",
+ " 'hezmcfli',\n",
+ " 'definit',\n",
+ " 'damn',\n",
+ " 'sick',\n",
+ " 'studi',\n",
+ " 'law',\n",
+ " 'leav',\n",
+ " 'stra',\n",
+ " 'doubl',\n",
+ " 'ea',\n",
+ " 'bradford',\n",
+ " 'aw',\n",
+ " 'shut',\n",
+ " 'lol',\n",
+ " 'superbad',\n",
+ " 'ashkiiwil',\n",
+ " 'sportsgirlsplay',\n",
+ " 'coach',\n",
+ " 'forc',\n",
+ " 'retir',\n",
+ " 'year',\n",
+ " 'ago',\n",
+ " 'due',\n",
+ " 'injuri',\n",
+ " 'terrinixon',\n",
+ " 'laugh',\n",
+ " 'oral',\n",
+ " 'present',\n",
+ " 'monday',\n",
+ " 'whole',\n",
+ " 'johnherman',\n",
+ " 'congrat',\n",
+ " 'deliveri',\n",
+ " 'trust',\n",
+ " 'theoshu',\n",
+ " 'oishi',\n",
+ " 'cheap',\n",
+ " 'satisfi',\n",
+ " 'sushi',\n",
+ " 'soup',\n",
+ " 'excel',\n",
+ " 'edward',\n",
+ " 'dumb',\n",
+ " 'twin',\n",
+ " 'rabbitport',\n",
+ " 'charact',\n",
+ " 'tuna',\n",
+ " 'sandwich',\n",
+ " 'done',\n",
+ " 'mayson',\n",
+ " 'youu',\n",
+ " 'eu',\n",
+ " 'queria',\n",
+ " 'que',\n",
+ " 'era',\n",
+ " 'rewind',\n",
+ " 'fo',\n",
+ " 'pr',\n",
+ " 'ximo',\n",
+ " 'singl',\n",
+ " 'ou',\n",
+ " 'without',\n",
+ " 'worst',\n",
+ " 'ever',\n",
+ " 'fml',\n",
+ " 'fashion',\n",
+ " 'statement',\n",
+ " 'head',\n",
+ " 'scarf',\n",
+ " 'style',\n",
+ " 'doin',\n",
+ " 'mama',\n",
+ " 'yoyoemma',\n",
+ " 'ceekaigax',\n",
+ " 'away',\n",
+ " 'fulli',\n",
+ " 'woken',\n",
+ " 'winterchick',\n",
+ " 'nah',\n",
+ " 'write',\n",
+ " 'wrote',\n",
+ " 'basic',\n",
+ " 'idea',\n",
+ " 'breakfast',\n",
+ " 'sat',\n",
+ " 'zach',\n",
+ " 'ashleeeyyyyy',\n",
+ " 'tushsharma',\n",
+ " 'uh',\n",
+ " 'huh',\n",
+ " 'fellow',\n",
+ " 'unit',\n",
+ " 'cooki',\n",
+ " 'mood',\n",
+ " 'bos',\n",
+ " 'bitch',\n",
+ " 'endlessli',\n",
+ " 'cre',\n",
+ " 'tvdirektr',\n",
+ " 'wow',\n",
+ " 'nurseju',\n",
+ " 'earli',\n",
+ " 'taxi',\n",
+ " 'worth',\n",
+ " 'give',\n",
+ " 'beamer',\n",
+ " 'washhhh',\n",
+ " 'mamzellef',\n",
+ " 'dad',\n",
+ " 'tallk',\n",
+ " 'fix',\n",
+ " 'weareleet',\n",
+ " 'hungov',\n",
+ " 'greasi',\n",
+ " 'noth',\n",
+ " 'help',\n",
+ " 'keep',\n",
+ " 'run',\n",
+ " 'truth',\n",
+ " 'hw',\n",
+ " 'may',\n",
+ " 'bother',\n",
+ " 'meraki',\n",
+ " 'blanket',\n",
+ " 'san',\n",
+ " 'fran',\n",
+ " 'free',\n",
+ " 'wi',\n",
+ " 'fi',\n",
+ " 'news',\n",
+ " 'sheilafightseb',\n",
+ " 'sheila',\n",
+ " 'riddl',\n",
+ " 'horni',\n",
+ " 'kitti',\n",
+ " 'funni',\n",
+ " 'accur',\n",
+ " 'pest',\n",
+ " 'whiteplum',\n",
+ " 'inde',\n",
+ " 'bet',\n",
+ " 'dog',\n",
+ " 'bun',\n",
+ " 'sea',\n",
+ " 'garbag',\n",
+ " 'truck',\n",
+ " 'guy',\n",
+ " 'plenti',\n",
+ " 'flashbelt',\n",
+ " 'tue',\n",
+ " 'wen',\n",
+ " 'beacus',\n",
+ " 'ex',\n",
+ " 'girlfrend',\n",
+ " 'plussizemommi',\n",
+ " 'reason',\n",
+ " 'open',\n",
+ " 'found',\n",
+ " 'whew',\n",
+ " 'collabor',\n",
+ " 'admit',\n",
+ " 'aka',\n",
+ " 'kristin',\n",
+ " 'daynaroselli',\n",
+ " 'elev',\n",
+ " 'favorit',\n",
+ " 'danddncgirl',\n",
+ " 'sorri',\n",
+ " 'hear',\n",
+ " 'part',\n",
+ " 'phx',\n",
+ " 'rel',\n",
+ " 'dmosley',\n",
+ " 'goingbto',\n",
+ " 'ohsailor',\n",
+ " 'sri',\n",
+ " 'bb',\n",
+ " 'chrissyjohnson',\n",
+ " 'jacki',\n",
+ " 'fair',\n",
+ " 'tomm',\n",
+ " 'sunday',\n",
+ " 'rachaelxxo',\n",
+ " 'oooh',\n",
+ " 'question',\n",
+ " 'mark',\n",
+ " 'quit',\n",
+ " 'pick',\n",
+ " 'aah',\n",
+ " 'annoy',\n",
+ " 'throat',\n",
+ " 'seandonaho',\n",
+ " 'ad',\n",
+ " 'sergverdi',\n",
+ " 'lesli',\n",
+ " 'vfcst',\n",
+ " 'ouchh',\n",
+ " 'yea',\n",
+ " 'aim',\n",
+ " 'talk',\n",
+ " 'mee',\n",
+ " 'laptop',\n",
+ " 'freak',\n",
+ " 'piss',\n",
+ " 'jerk',\n",
+ " 'heart',\n",
+ " 'yank',\n",
+ " 'bryanlyt',\n",
+ " 'certain',\n",
+ " 'area',\n",
+ " 'utama',\n",
+ " 'kota',\n",
+ " 'dsara',\n",
+ " 'mayhemmil',\n",
+ " 'elli',\n",
+ " 'defin',\n",
+ " 'faction',\n",
+ " 'hell',\n",
+ " 'son',\n",
+ " 'cat',\n",
+ " 'unpack',\n",
+ " 'miklo',\n",
+ " 'victori',\n",
+ " 'park',\n",
+ " 'twilight',\n",
+ " 'lineup',\n",
+ " 'bummer',\n",
+ " 'proud',\n",
+ " 'kill',\n",
+ " 'thursday',\n",
+ " 'aye',\n",
+ " 'xcspeed',\n",
+ " 'south',\n",
+ " 'padr',\n",
+ " 'island',\n",
+ " 'tommorow',\n",
+ " 'schlitterban',\n",
+ " 'ali',\n",
+ " 'davi',\n",
+ " 'lucki',\n",
+ " 'student',\n",
+ " 'teach',\n",
+ " 'point',\n",
+ " 'ncheck',\n",
+ " 'servic',\n",
+ " 'number',\n",
+ " 'gb',\n",
+ " 'gorgeou',\n",
+ " 'weather',\n",
+ " 'bike',\n",
+ " 'london',\n",
+ " 'babi',\n",
+ " 'refil',\n",
+ " 'bttle',\n",
+ " 'gym',\n",
+ " 'fountain',\n",
+ " 'spilt',\n",
+ " 'evrywhr',\n",
+ " 'embarress',\n",
+ " 'controversi',\n",
+ " 'link',\n",
+ " 'privat',\n",
+ " 'wahhh',\n",
+ " 'restart',\n",
+ " 'heheh',\n",
+ " 'jonasbroth',\n",
+ " 'bf',\n",
+ " 'weird',\n",
+ " 'let',\n",
+ " 'updat',\n",
+ " 'follow',\n",
+ " 'deathli',\n",
+ " 'hangov',\n",
+ " 'morn',\n",
+ " 'puke',\n",
+ " 'zorb',\n",
+ " 'buffet',\n",
+ " 'swag',\n",
+ " 'bag',\n",
+ " 'panten',\n",
+ " 'shoppen',\n",
+ " 'toll',\n",
+ " 'un',\n",
+ " 'tina',\n",
+ " 'getroffen',\n",
+ " 'yez',\n",
+ " 'pc',\n",
+ " 'gammeln',\n",
+ " 'chakatsunstreak',\n",
+ " 'small',\n",
+ " 'math',\n",
+ " 'yes',\n",
+ " 'horribl',\n",
+ " 'excit',\n",
+ " 'microsoft',\n",
+ " 'confer',\n",
+ " 'late',\n",
+ " 'awesom',\n",
+ " 'afro',\n",
+ " 'ffxiii',\n",
+ " 'w',\n",
+ " 'shelley',\n",
+ " 'airport',\n",
+ " 'nephew',\n",
+ " 'kat',\n",
+ " 'dentist',\n",
+ " 'brace',\n",
+ " 'monicaa',\n",
+ " 'sengupta',\n",
+ " 'krist',\n",
+ " 'ph',\n",
+ " 'r',\n",
+ " 'repeat',\n",
+ " 'parentstud',\n",
+ " 'marcolaureano',\n",
+ " 'welcom',\n",
+ " 'non',\n",
+ " 'stop',\n",
+ " 'tweet',\n",
+ " 'johnkuan',\n",
+ " 'pronaz',\n",
+ " 'raj',\n",
+ " 'lebron',\n",
+ " 'v',\n",
+ " 'kobe',\n",
+ " 'least',\n",
+ " 'yanke',\n",
+ " 'gah',\n",
+ " 'mimic',\n",
+ " 'ubisoft',\n",
+ " 'littl',\n",
+ " 'orient',\n",
+ " 'eminem',\n",
+ " 'track',\n",
+ " 'formula',\n",
+ " 'seriou',\n",
+ " 'arf',\n",
+ " 'guinea',\n",
+ " 'pig',\n",
+ " 'dine',\n",
+ " 'noisi',\n",
+ " 'doggi',\n",
+ " 'ryke',\n",
+ " 'whether',\n",
+ " 'includ',\n",
+ " 'straight',\n",
+ " 'latest',\n",
+ " 'report',\n",
+ " 'surviv',\n",
+ " 'recess',\n",
+ " 'ask',\n",
+ " 'www',\n",
+ " 'bgacceler',\n",
+ " 'episod',\n",
+ " 'simpson',\n",
+ " 'pollinatewildli',\n",
+ " 'catalyt',\n",
+ " 'convert',\n",
+ " 'broken',\n",
+ " 'cost',\n",
+ " 'sold',\n",
+ " 'max',\n",
+ " 'settin',\n",
+ " 'websit',\n",
+ " 'whoa',\n",
+ " 'ethansuple',\n",
+ " 'boy',\n",
+ " 'freakin',\n",
+ " 'strong',\n",
+ " 'weakest',\n",
+ " 'rorzshach',\n",
+ " 'devon',\n",
+ " 'hahaha',\n",
+ " 'freed',\n",
+ " 'prison',\n",
+ " 'ah',\n",
+ " 'freedom',\n",
+ " 'princesssuperc',\n",
+ " 'shoot',\n",
+ " 'delboy',\n",
+ " 'promot',\n",
+ " 'doe',\n",
+ " 'lindasmith',\n",
+ " 'pharmaci',\n",
+ " 'dalla',\n",
+ " 'hawaii',\n",
+ " 'friday',\n",
+ " 'cowboyhazel',\n",
+ " 'ok',\n",
+ " 'tip',\n",
+ " 'plurk',\n",
+ " 'hellasia',\n",
+ " 'stu',\n",
+ " 'gg',\n",
+ " 'young',\n",
+ " 'world',\n",
+ " 'foot',\n",
+ " 'killin',\n",
+ " 'name',\n",
+ " 'better',\n",
+ " 'ahead',\n",
+ " 'ashleebiscuit',\n",
+ " 'fell',\n",
+ " 'fring',\n",
+ " 'hang',\n",
+ " 'puppi',\n",
+ " 'pakc',\n",
+ " 'texa',\n",
+ " 'mission',\n",
+ " 'jessemccartney',\n",
+ " 'ughhh',\n",
+ " 'c',\n",
+ " 'sooo',\n",
+ " 'longer',\n",
+ " 'term',\n",
+ " 'trick',\n",
+ " 'jennybdesign',\n",
+ " 'correct',\n",
+ " 'goe',\n",
+ " 'public',\n",
+ " 'lottaburg',\n",
+ " 'jam',\n",
+ " 'fireflight',\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "cfdist = nltk.FreqDist()\n",
+ "\n",
+ "for tweet in sample['text_processed']:\n",
+ " for word in tweet:\n",
+ " cfdist[word] += 1\n",
+ "\n",
+ "top_words = list(cfdist.keys())[:5000]\n",
+ "top_words"
]
},
{
@@ -167,11 +1469,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20000\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
+ "def find_features(document):\n",
+ " words = set(document)\n",
+ " features = {}\n",
+ " for w in top_words:\n",
+ " features[w] = (w in words)\n",
+ " \n",
+ " return features\n",
+ " \n",
+ "feature_sets = [(find_features(tweet), target) for (tweet, target) in list(zip(sample['text_processed'], sample['target']))]\n",
+ "print(len(feature_sets))"
]
},
{
@@ -210,11 +1529,12 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "train_set, test_set = feature_sets[:10000], feature_sets[10000:]\n",
+ "classifier = nltk.NaiveBayesClassifier.train(train_set)"
]
},
{
@@ -230,11 +1550,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.7162\n",
+ "Most Informative Features\n",
+ " sad = True 0 : 1 = 22.1 : 1.0\n",
+ " sick = True 0 : 1 = 16.5 : 1.0\n",
+ " headach = True 0 : 1 = 15.8 : 1.0\n",
+ " bum = True 0 : 1 = 13.3 : 1.0\n",
+ " hospit = True 0 : 1 = 12.7 : 1.0\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
+ "print(nltk.classify.accuracy(classifier, test_set))\n",
+ "classifier.show_most_informative_features(5)"
]
},
{
@@ -298,7 +1633,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -312,7 +1647,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.10.4"
}
},
"nbformat": 4,