Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 127 additions & 11 deletions your-code/challenge-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,30 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import re \n",
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ironhack s q website is'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
Expand All @@ -79,7 +100,14 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
" \"\"\"\n",
" string= re.sub(r'http\\S+','',s)\n",
" return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
" \n",
"test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n",
"\n",
"test_string = clean_up(test)\n",
"test_string"
]
},
{
Expand All @@ -101,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -114,7 +142,29 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
" \"\"\"\n",
" return nltk.word_tokenize(s)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'website', 'is']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokens = tokenize(test_string)\n",
"tokens"
]
},
{
Expand Down Expand Up @@ -145,7 +195,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.stem import PorterStemmer"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -158,7 +218,37 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
" \"\"\"\n",
" ps = nltk.PorterStemmer()\n",
" lemmatizer = nltk.WordNetLemmatizer()\n",
" l2 = []\n",
" \n",
" for w in l:\n",
" s = ps.stem(w)\n",
" s = lemmatizer.lemmatize(s)\n",
" l2 += [s]\n",
" \n",
" return l2\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'websit', 'is']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stem_and_lemmatize(tokens)"
]
},
{
Expand All @@ -176,10 +266,19 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"r n h c k q w e b e \n"
]
}
],
"source": [
"from nltk.corpus import stopwords\n",
"def remove_stopwords(l):\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
Expand All @@ -189,9 +288,21 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
" \"\"\"\n",
" stop_words = stopwords.words('english')\n",
"\n",
" return ' '.join([w for w in l if w not in stop_words])\n",
"\n",
"print(remove_stopwords(tokens))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -204,7 +315,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
Expand All @@ -218,7 +329,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.9.12"
},
"vscode": {
"interpreter": {
"hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
}
}
},
"nbformat": 4,
Expand Down
Loading