158 changes: 144 additions & 14 deletions your-code/challenge-1.ipynb
@@ -66,9 +66,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'ironhack s q website is'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
@@ -79,7 +90,19 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
" \"\"\"\n",
" import re\n",
" \n",
" # remove URLs\n",
" s = re.sub(r'http.+', '', s)\n",
" \n",
" # remove special characters\n",
" s = re.sub(r'[^a-zA-Z\\s]+', ' ', s)\n",
" \n",
" return \" \".join([word for word in s.lower().split()])\n",
" \n",
"\n",
"clean_up(\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\")"
]
},
{
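Reviewer note on `clean_up`: the pattern `r'http.+'` is greedy, so it deletes everything from the first `http` to the end of the string; in the test call above that is also what removes the trailing `[(2018)]\")`. If only the URL itself should be dropped, a narrower pattern stops at the next whitespace. A minimal sketch of the difference (a suggestion, not part of this PR):

```python
import re

# The diff's pattern removes the URL *and* everything after it;
# r'http\S+' (suggested alternative) stops at the first whitespace.
s = "check http://ironhack.com today"
print(re.sub(r'http.+', '', s))   # 'check '
print(re.sub(r'http\S+', '', s))  # 'check  today'
```

For the notebook's own test string both variants arrive at 'ironhack s q website is', because the special-character pass removes `[(2018)]` anyway.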
@@ -101,10 +124,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] /Users/rickardramhoj/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'website', 'is']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import nltk and tokenizer\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"nltk.download(\"punkt\")\n",
"\n",
"def tokenize(s):\n",
" \"\"\"\n",
" Tokenize a string.\n",
@@ -114,7 +162,12 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" return [word.lower() for word in word_tokenize(s) if word.isalpha()]\n",
"\n",
"# test it\n",
"tokenize(\"ironhack s q website is\")"
]
},
{
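Reviewer note on `tokenize`: `word_tokenize` needs the punkt model downloaded above, and the `isalpha()` filter drops any token containing digits or punctuation. Since `clean_up` already lowercases and strips non-letters, a plain `str.split` would yield the same tokens at this point in the pipeline; `word_tokenize` only pays off on raw text. A small sketch, assuming punkt is installed:

```python
from nltk.tokenize import word_tokenize

s = "ironhack s q website is"
# On text already normalized by clean_up, both approaches agree;
# word_tokenize differs on raw text, e.g. "don't" -> ['do', "n't"].
assert [w.lower() for w in word_tokenize(s) if w.isalpha()] == s.split()
```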
@@ -145,9 +198,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /Users/rickardramhoj/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'websit', 'is']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def stem_and_lemmatize(l):\n",
" \"\"\"\n",
@@ -158,7 +231,33 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" # import libraries\n",
" from nltk.stem import PorterStemmer\n",
" nltk.download(\"wordnet\")\n",
" from nltk.stem import WordNetLemmatizer\n",
" \n",
" #initiate stemmer \n",
" stemmer = PorterStemmer()\n",
" \n",
" # initiate lemmatizer\n",
" lemmatizer = WordNetLemmatizer()\n",
" \n",
" # define list\n",
" new_list = []\n",
" \n",
" # loop words and append lemmatized and stemmed\n",
" for word in l:\n",
" lemmatized = lemmatizer.lemmatize(word)\n",
" new_list.append(stemmer.stem(lemmatized))\n",
" #new_list.append({\"stemmed\": stemmer.stem(word), \"lemmatized\": lemmatizer.lemmatize(word)})\n",
" \n",
" return new_list\n",
" \n",
"# test it\n",
"stem_and_lemmatize(['ironhack', 's', 'q', 'website', 'is'])\n",
" "
]
},
{
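Reviewer note on `stem_and_lemmatize`: `WordNetLemmatizer.lemmatize` defaults to the noun part of speech, so a verb like "is" passes through unchanged, while `PorterStemmer` clips "website" to "websit"; both effects are visible in the output above. A short sketch of the POS effect (assumes the wordnet corpus is already downloaded, as in the cell above):

```python
from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

print(lemmatizer.lemmatize("is"))           # 'is'  -- treated as a noun by default
print(lemmatizer.lemmatize("is", pos="v"))  # 'be'  -- lemmatized as a verb
print(stemmer.stem("website"))              # 'websit' -- matches the output above
```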
@@ -176,9 +275,29 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/rickardramhoj/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['ironhack', 'q', 'websit']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def remove_stopwords(l):\n",
" \"\"\"\n",
@@ -189,7 +308,18 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" # import stopwords\n",
" nltk.download(\"stopwords\")\n",
" from nltk.corpus import stopwords\n",
" \n",
" return [word for word in l if word not in stopwords.words()]\n",
"\n",
"# test it\n",
"remove_stopwords(['ironhack', 's', 'q', 'websit', 'is'])\n",
" \n",
" "
]
},
{
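Reviewer note on `remove_stopwords`: called with no argument, `stopwords.words()` returns the concatenated stop lists for every language NLTK ships, and the expression is re-evaluated for each word in the comprehension. Restricting to English and building a `set` once makes each membership test O(1). A sketch of the usual pattern (a suggestion, not part of this PR; assumes the stopwords corpus is downloaded):

```python
from nltk.corpus import stopwords

# Build the English stop list once as a set instead of rebuilding
# the full multilingual list for every word.
english_stops = set(stopwords.words("english"))
words = ['ironhack', 's', 'q', 'websit', 'is']
print([w for w in words if w not in english_stops])  # ['ironhack', 'q', 'websit']
```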
@@ -204,7 +334,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -218,7 +348,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.6"
}
},
"nbformat": 4,
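Taken together, the four functions added in this PR form one preprocessing pipeline. A minimal end-to-end sketch, chaining them on the notebook's own test tweet (the commented values are the cell outputs recorded in the diff):

```python
# Chain the four helpers defined in the notebook, assuming the cells
# above have been run so the functions and NLTK data are available.
raw = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")"
cleaned = clean_up(raw)             # 'ironhack s q website is'
tokens = tokenize(cleaned)          # ['ironhack', 's', 'q', 'website', 'is']
stems = stem_and_lemmatize(tokens)  # ['ironhack', 's', 'q', 'websit', 'is']
final = remove_stopwords(stems)     # ['ironhack', 'q', 'websit']
print(final)
```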