import re


def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up (only letters and single spaces
        remain; URLs, @mentions, digits and punctuation become spaces).
    """
    # Remove whole URLs first. The previous rule list stripped the bare
    # substrings 'http' and 'com', which would also mangle ordinary words
    # such as 'accomplish' -> 'ac plish'.
    s = re.sub(r'https?://\S+|www\.\S+', ' ', s)
    # Drop @mentions.
    s = re.sub(r'@\w+', ' ', s)
    # Replace every run of non-letters with a single space. The old class
    # '[^a-z, A-Z]' accidentally whitelisted ',' and ' ' inside the class.
    s = re.sub(r'[^A-Za-z]+', ' ', s)
    return s


def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    # Imported lazily so this module stays importable when nltk is absent.
    import nltk
    return nltk.word_tokenize(s)


string = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")"
cleaned = clean_up(string)
cleaned
from nltk.stem import PorterStemmer, WordNetLemmatizer


def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    # Build the two transformers once, outside the loop.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Stem first, then lemmatize the stemmed form (same order as before).
    # NOTE: the docstring used to sit *after* these statements (so it was
    # not a docstring) and began with a stray '"""='.
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in l]


stemandlemma = stem_and_lemmatize(token)
from nltk.corpus import stopwords


def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A new list of strings after stop words are removed.
    """
    stop_words = set(stopwords.words('english'))
    # Build a new list instead of calling l.remove() while iterating l:
    # mutating a list mid-iteration skips the element following each
    # removal, so consecutive stopwords were left in. This version also
    # no longer mutates the caller's list.
    return [word for word in l if word not in stop_words]


# 'stemed' was never defined (NameError); the stem/lemmatize cell stored
# its result in 'stemandlemma'.
filtered = remove_stopwords(stemandlemma)
filtered
# your code here
import pandas as pd

from cleaning_functions import clean_up, tokenize, stem_and_lemmatize, remove_stopwords

# Sentiment140 ships WITHOUT a header row. With the default header=0,
# read_csv swallowed the first tweet as column names, which the old
# df.columns = [...] assignment then silently discarded (that is why
# df['text'][0] showed the dataset's second tweet). Read with
# header=None and explicit names instead.
df = pd.read_csv(
    r'C:/Users/Sebas!/Documents/GitHub/sentiments.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
)

# Reproducible 25k-row sample (random_state fixed, no replacement).
sample = df.sample(n=25000, replace=False, random_state=3)


def _process(text):
    """Run one tweet through the full cleaning pipeline."""
    return remove_stopwords(stem_and_lemmatize(tokenize(clean_up(text))))


# Vectorized per-row apply replaces the manual loop + append + assign.
sample['text_processed'] = sample['text'].apply(_process)
# your code here
from collections import Counter
from itertools import chain

# Count every token across the processed sample in one pass
# (Counter is the stdlib base class of nltk.FreqDist).
word_freq = Counter(chain.from_iterable(sample['text_processed']))

# Keep the 5,000 MOST FREQUENT words. The old code took
# list(FreqDist.keys())[:5000]; in modern NLTK, FreqDist keys are in
# insertion order, not frequency order, so that "top 5000" was
# effectively arbitrary.
bag_words = [word for word, _ in word_freq.most_common(5000)]
\n", + "\n", + " text_processed \n", + "1164659 [inform, studi, show, my, follow, cooler, foll... \n", + "228999 [lost, follow, did, do, wrong] \n", + "1502596 [excit, i, get, go, work, late, tomorrow] \n", + "950813 [sorri, my, late, repli, saw, on, youtub, didn... \n", + "1258437 [bow, check, wow, wow, had, good, day, mike, h... \n", + "... ... \n", + "1023092 [haha, love, you, surpris, you, rub, on] \n", + "1398541 [immm, good, meet, taylon, pablo, the, park, t... \n", + "1458039 [otw, hati] \n", + "217219 [know, your, kid, not, feel, well, ,, she, t, ... \n", + "1040766 [like, load, the, car, the, middl, the, night,... \n", + "\n", + "[25000 rows x 7 columns]" + ], + "text/html": "
| \n | target | \nid | \ndate | \nflag | \nuser | \ntext | \ntext_processed | \n
|---|---|---|---|---|---|---|---|
| 1164659 | \n4 | \n1979779549 | \nSun May 31 04:28:28 PDT 2009 | \nNO_QUERY | \nMarcyChen | \nInformal study shows that my followers are coo... | \n[inform, studi, show, my, follow, cooler, foll... | \n
| 228999 | \n0 | \n1978383347 | \nSat May 30 23:11:37 PDT 2009 | \nNO_QUERY | \nemizell | \ni lost 2 followers...what did i do wrong?? | \n[lost, follow, did, do, wrong] | \n
| 1502596 | \n4 | \n2071730438 | \nSun Jun 07 19:50:23 PDT 2009 | \nNO_QUERY | \nJen_Kirby | \nExcited that I get to go into work late tomorrow | \n[excit, i, get, go, work, late, tomorrow] | \n
| 950813 | \n4 | \n1824089618 | \nSun May 17 00:00:32 PDT 2009 | \nNO_QUERY | \nZombulator | \n@neolee23 Sorry for my late reply! I saw it on... | \n[sorri, my, late, repli, saw, on, youtub, didn... | \n
| 1258437 | \n4 | \n1997910291 | \nMon Jun 01 17:40:09 PDT 2009 | \nNO_QUERY | \nbabybee3 | \nbow check a wow wow! i had a good day!! mike t... | \n[bow, check, wow, wow, had, good, day, mike, h... | \n
| ... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
| 1023092 | \n4 | \n1882933753 | \nFri May 22 07:44:00 PDT 2009 | \nNO_QUERY | \nYllwCkeNoFrstng | \n@tomogirl79 haha I love how you're surprised t... | \n[haha, love, you, surpris, you, rub, on] | \n
| 1398541 | \n4 | \n2054113019 | \nSat Jun 06 06:35:23 PDT 2009 | \nNO_QUERY | \npointee | \nimmm doing good! meeting taylon / pablo at the... | \n[immm, good, meet, taylon, pablo, the, park, t... | \n
| 1458039 | \n4 | \n2063635277 | \nSun Jun 07 03:28:05 PDT 2009 | \nNO_QUERY | \nanarmnetwork | \n@chempaka otw? hati2.. | \n[otw, hati] | \n
| 217219 | \n0 | \n1975887217 | \nSat May 30 16:07:01 PDT 2009 | \nNO_QUERY | \nfinnern | \nYou know that your kid is not feeling well, wh... | \n[know, your, kid, not, feel, well, ,, she, t, ... | \n
| 1040766 | \n4 | \n1956991914 | \nThu May 28 23:13:10 PDT 2009 | \nNO_QUERY | \nwhosmarisa | \nI like loading up the car in the middle of the... | \n[like, load, the, car, the, middl, the, night,... | \n
25000 rows × 7 columns
\n