diff --git a/your-code/__pycache__/cleaning_functions.cpython-38.pyc b/your-code/__pycache__/cleaning_functions.cpython-38.pyc new file mode 100644 index 0000000..f4edf08 Binary files /dev/null and b/your-code/__pycache__/cleaning_functions.cpython-38.pyc differ diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..d68ad50 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,11 +66,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "' s Q website is ironhack '" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], "source": [ + "import re\n", + "\n", + "string = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\"\n", + "\n", "def clean_up(s):\n", + " rules_list= ['@\\w+','[^a-z, A-Z]', 'http', 'com']\n", " \"\"\"\n", " Cleans up numbers, URLs, and special characters from a string.\n", "\n", @@ -79,7 +95,12 @@ "\n", " Returns:\n", " A string that has been cleaned up.\n", - " \"\"\"" + " \"\"\"\n", + " for rule in rules_list:\n", + " s = re.sub(rule,' ',s)\n", + " return s\n", + "cleaned = clean_up(string)\n", + "cleaned" ] }, { @@ -101,10 +122,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['s', 'Q', 'website', 'is', 'ironhack']" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], "source": [ + "import nltk\n", + "\n", "def tokenize(s):\n", " \"\"\"\n", " Tokenize a string.\n", @@ -114,7 +148,10 @@ "\n", " Returns:\n", " A list of words as the result of tokenization.\n", - " \"\"\"" + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "token = tokenize(cleaned)\n", + "token" ] }, { @@ -145,12 +182,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
8, "metadata": {}, "outputs": [], "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.stem import PorterStemmer\n", + "'''nltk.download('wordnet')\n", + "'''\n", + "\n", + "\n", "def stem_and_lemmatize(l):\n", - " \"\"\"\n", + " lemmatizer = WordNetLemmatizer()\n", + " stemporter = PorterStemmer()\n", + "\n", + " \"\"\"=\n", " Perform stemming and lemmatization on a list of words.\n", "\n", " Args:\n", @@ -158,7 +204,37 @@ "\n", " Returns:\n", " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " \"\"\"\n", + " stemandlemmalist=[]\n", + " \n", + "\n", + " for word in l:\n", + " stemmed = stemporter.stem(word)\n", + " stemandlemma = lemmatizer.lemmatize(stemmed)\n", + " stemandlemmalist.append(stemandlemma)\n", + " return stemandlemmalist\n", + "\n", + "stemandlemma = stem_and_lemmatize(token)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['s', 'q', 'websit', 'is', 'ironhack']" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "stemandlemma" ] }, { @@ -176,10 +252,32 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to\n[nltk_data] C:\\Users\\Sebas!\\AppData\\Roaming\\nltk_data...\n[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['q', 'websit', 'ironhack']" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ], "source": [ + "from nltk.corpus import stopwords\n", + "'''nltk.download('stopwords')\n", + "'''\n", + "\n", "def remove_stopwords(l):\n", " \"\"\"\n", " Remove English stopwords from a list of strings.\n", @@ -189,7 +287,16 @@ "\n", " Returns:\n", " A list of strings after stop words are removed.\n", - 
" \"\"\"" + " \"\"\"\n", + " stop_words = set(stopwords.words('english'))\n", + " \n", + " for words in l:\n", + " if words in stop_words:\n", + " l.remove(words)\n", + " return l\n", + "\n", + "filtered = remove_stopwords(stemed)\n", + "filtered" ] }, { @@ -204,9 +311,8 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python385jvsc74a57bd09efc80705562ef6f8028ba9c07828938c290468cbec0ebcf2b44f68ee94d478d", + "display_name": "Python 3.8.5 64-bit" }, "language_info": { "codemirror_mode": { @@ -218,9 +324,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..00e4a35 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -18,8 +18,8 @@ "\n", "```python\n", ">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", - ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
", - "
", + ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
\n", + "
\n", "Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n", ">>> analyzer = SentimentIntensityAnalyzer()\n", ">>> analyzer.polarity_scores(txt)\n", @@ -46,11 +46,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "import pandas as pd \n", + "\n", + "df = pd.read_csv(r'C:/Users/Sebas!/Documents/GitHub/sentiments.csv',encoding='latin-1')\n", + "df.columns= ['target','id','date','flag','user','text']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "\"is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!\"" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "df['text'][0]" ] }, { @@ -76,11 +100,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "from cleaning_functions import clean_up, tokenize, stem_and_lemmatize, remove_stopwords\n", + "\n", + "sample = df.sample(n=25000, replace=False, random_state=3)\n", + "\n", + "txt_processed = []\n", + "\n", + "for text in sample['text']:\n", + " cleaned = clean_up(text)\n", + " token = tokenize(cleaned)\n", + " stemAndlemma = stem_and_lemmatize(token)\n", + " filtered = remove_stopwords(stemAndlemma) \n", + " txt_processed.append(filtered)\n", + "\n", + "sample['text_processed'] = txt_processed\n", + "\n" ] }, { @@ -98,11 +137,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "from nltk.probability import ConditionalFreqDist\n", + "import nltk\n", + "import random\n", + "\n", + "all_words = 
[]\n", + "for lst in sample['text_processed']:\n", + " for word in lst:\n", + " all_words.append(word)\n", + "\n", + "all_words = nltk.FreqDist(all_words)\n", + "\n", + "\n", + "bag_words = list(all_words.keys())[:5000]\n" ] }, { @@ -167,11 +219,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " target id date flag \\\n", + "1164659 4 1979779549 Sun May 31 04:28:28 PDT 2009 NO_QUERY \n", + "228999 0 1978383347 Sat May 30 23:11:37 PDT 2009 NO_QUERY \n", + "1502596 4 2071730438 Sun Jun 07 19:50:23 PDT 2009 NO_QUERY \n", + "950813 4 1824089618 Sun May 17 00:00:32 PDT 2009 NO_QUERY \n", + "1258437 4 1997910291 Mon Jun 01 17:40:09 PDT 2009 NO_QUERY \n", + "... ... ... ... ... \n", + "1023092 4 1882933753 Fri May 22 07:44:00 PDT 2009 NO_QUERY \n", + "1398541 4 2054113019 Sat Jun 06 06:35:23 PDT 2009 NO_QUERY \n", + "1458039 4 2063635277 Sun Jun 07 03:28:05 PDT 2009 NO_QUERY \n", + "217219 0 1975887217 Sat May 30 16:07:01 PDT 2009 NO_QUERY \n", + "1040766 4 1956991914 Thu May 28 23:13:10 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "1164659 MarcyChen Informal study shows that my followers are coo... \n", + "228999 emizell i lost 2 followers...what did i do wrong?? \n", + "1502596 Jen_Kirby Excited that I get to go into work late tomorrow \n", + "950813 Zombulator @neolee23 Sorry for my late reply! I saw it on... \n", + "1258437 babybee3 bow check a wow wow! i had a good day!! mike t... \n", + "... ... ... \n", + "1023092 YllwCkeNoFrstng @tomogirl79 haha I love how you're surprised t... \n", + "1398541 pointee immm doing good! meeting taylon / pablo at the... \n", + "1458039 anarmnetwork @chempaka otw? hati2.. \n", + "217219 finnern You know that your kid is not feeling well, wh... \n", + "1040766 whosmarisa I like loading up the car in the middle of the... 
\n", + "\n", + " text_processed \n", + "1164659 [inform, studi, show, my, follow, cooler, foll... \n", + "228999 [lost, follow, did, do, wrong] \n", + "1502596 [excit, i, get, go, work, late, tomorrow] \n", + "950813 [sorri, my, late, repli, saw, on, youtub, didn... \n", + "1258437 [bow, check, wow, wow, had, good, day, mike, h... \n", + "... ... \n", + "1023092 [haha, love, you, surpris, you, rub, on] \n", + "1398541 [immm, good, meet, taylon, pablo, the, park, t... \n", + "1458039 [otw, hati] \n", + "217219 [know, your, kid, not, feel, well, ,, she, t, ... \n", + "1040766 [like, load, the, car, the, middl, the, night,... \n", + "\n", + "[25000 rows x 7 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
targetiddateflagusertexttext_processed
116465941979779549Sun May 31 04:28:28 PDT 2009NO_QUERYMarcyChenInformal study shows that my followers are coo...[inform, studi, show, my, follow, cooler, foll...
22899901978383347Sat May 30 23:11:37 PDT 2009NO_QUERYemizelli lost 2 followers...what did i do wrong??[lost, follow, did, do, wrong]
150259642071730438Sun Jun 07 19:50:23 PDT 2009NO_QUERYJen_KirbyExcited that I get to go into work late tomorrow[excit, i, get, go, work, late, tomorrow]
95081341824089618Sun May 17 00:00:32 PDT 2009NO_QUERYZombulator@neolee23 Sorry for my late reply! I saw it on...[sorri, my, late, repli, saw, on, youtub, didn...
125843741997910291Mon Jun 01 17:40:09 PDT 2009NO_QUERYbabybee3bow check a wow wow! i had a good day!! mike t...[bow, check, wow, wow, had, good, day, mike, h...
........................
102309241882933753Fri May 22 07:44:00 PDT 2009NO_QUERYYllwCkeNoFrstng@tomogirl79 haha I love how you're surprised t...[haha, love, you, surpris, you, rub, on]
139854142054113019Sat Jun 06 06:35:23 PDT 2009NO_QUERYpointeeimmm doing good! meeting taylon / pablo at the...[immm, good, meet, taylon, pablo, the, park, t...
145803942063635277Sun Jun 07 03:28:05 PDT 2009NO_QUERYanarmnetwork@chempaka otw? hati2..[otw, hati]
21721901975887217Sat May 30 16:07:01 PDT 2009NO_QUERYfinnernYou know that your kid is not feeling well, wh...[know, your, kid, not, feel, well, ,, she, t, ...
104076641956991914Thu May 28 23:13:10 PDT 2009NO_QUERYwhosmarisaI like loading up the car in the middle of the...[like, load, the, car, the, middl, the, night,...
\n

25000 rows × 7 columns

\n
" + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "sample\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "dic_feat = {}\n", + "\n", + "words = set(word for lst in sample['text_processed'] for word in lst)\n", + "\n", + "for word in bag_words:\n", + " dic_feat[word] = (word in words)\n", + "\n", + "features = [({keys:dic_feat[keys]}, dic_feat[keys]) for keys in dic_feat.keys()]" ] }, { @@ -210,11 +333,32 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "training = features[:3000]\n", + "test = features[3000:]\n", + "\n", + "classifier = nltk.NaiveBayesClassifier.train(training)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Most Informative Features\n" + ] + } + ], + "source": [ + "classifier.show_most_informative_features()" ] }, { @@ -230,11 +374,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.0\nMost Informative Features\n" + ] + } + ], "source": [ - "# your code here" + "# your code here\n", + "print(nltk.classify.accuracy(classifier,test))\n", + "classifier.show_most_informative_features(15)" ] }, { @@ -252,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -270,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -288,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -298,9 
"""Text-preprocessing helpers for the sentiment-analysis notebooks.

Provides cleaning, tokenization, stemming/lemmatization, and English
stopword removal. NLTK is imported lazily inside the functions that need
it so this module can be imported (and clean_up used) without NLTK.
"""

import re


def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up (each removed span becomes a
        single space).
    """
    # Order matters: strip whole URLs and @mentions first, otherwise the
    # catch-all letter filter below would leave their alphabetic residue.
    # (The previous rules substituted the literal substrings 'http' and
    # 'com' after the catch-all, which mangled ordinary words such as
    # 'become' -> 'be e', and the class '[^a-z,A-Z]' wrongly kept commas.)
    rules = [
        r"https?://\S+",  # whole URLs
        r"@\w+",          # @mentions / handles
        r"[^a-zA-Z]",     # digits and special characters
    ]
    for rule in rules:
        s = re.sub(rule, " ", s)
    return s


def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    import nltk  # lazy: only this function needs the tokenizer models

    return nltk.word_tokenize(s)


def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    from nltk.stem import PorterStemmer, WordNetLemmatizer

    # Build the stemmer/lemmatizer once per call, not once per word.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in l]


def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A new list of strings after stop words are removed.
    """
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words("english"))
    # Build a new list instead of calling l.remove() inside a loop over l:
    # mutating while iterating silently skipped the element that followed
    # every removed stopword (consecutive stopwords were kept).
    return [word for word in l if word not in stop_words]