162 changes: 149 additions & 13 deletions your-code/challenge-1.ipynb
@@ -66,10 +66,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ironhack s q website is \n"
]
}
],
"source": [
"import re\n",
"\n",
"def clean_up(s):\n",
" \"\"\"\n",
" Cleans up numbers, URLs, and special characters from a string.\n",
@@ -79,7 +89,14 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" element = re.sub('http://.+', '', s)\n",
" element = re.sub('@|-|#|\\'', ' ', element)\n",
" return (re.sub('\\d+',' ',element)).lower()\n",
" \n",
"string = \"\"\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")\"\"\"\n",
"print(clean_up(string))"
]
},
{
@@ -101,10 +118,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\DELL\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Unzipping tokenizers\\punkt.zip.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 's', 'q', 'website', 'is']\n"
]
}
],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"nltk.download('punkt')\n",
"\n",
"def tokenize(s):\n",
" \"\"\"\n",
" Tokenize a string.\n",
@@ -114,7 +152,13 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
" \"\"\"\n",
"\n",
" return word_tokenize(s)\n",
" \n",
"string = clean_up(string)\n",
"token = tokenize(string)\n",
"print(token)"
]
},
{
@@ -145,10 +189,57 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 41,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] C:\\Users\\DELL\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] C:\\Users\\DELL\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n"
]
},
{
"data": {
"text/plain": [
"['the',\n",
" 'wonderful',\n",
" 'tokenization',\n",
" 'be',\n",
" 'colourfull',\n",
" 'in',\n",
" 'housekeep',\n",
" 'forever']"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.stem import PorterStemmer\n",
"nltk.download('wordnet')\n",
"from nltk.corpus import wordnet\n",
"\n",
"nltk.download('averaged_perceptron_tagger')\n",
"\n",
"def get_wordnet_pos(word):\n",
" tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization\n",
" tag_dict = {\"J\": wordnet.ADJ, \n",
" \"N\": wordnet.NOUN,\n",
" \"V\": wordnet.VERB,\n",
" \"R\": wordnet.ADV}\n",
" return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist\n",
"\n",
"\n",
"def stem_and_lemmatize(l):\n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
@@ -158,7 +249,19 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" lem = WordNetLemmatizer()\n",
" lemmatized = [lem.lemmatize(w,get_wordnet_pos(w)) for w in l]\n",
" \n",
" ps = PorterStemmer()\n",
" stemmed = [ps.stem(w) for w in l]\n",
" \n",
" return stemmed, lemmatized\n",
"\n",
"sentence1, sentence2 = stem_and_lemmatize(['the','wonderful','tokenization','is','colourfull','in','housekeeping','forever'])\n",
"\n",
"sentence2"
]
},
{
@@ -176,10 +279,33 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 42,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\DELL\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['wonderful', 'tokenization', 'colourfull', 'housekeep', 'forever']"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from nltk.corpus import stopwords\n",
"nltk.download('stopwords')\n",
"\n",
"def remove_stopwords(l):\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
@@ -189,7 +315,17 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" filtered_sentence = []\n",
" \n",
" for w in l:\n",
" if w not in stopwords.words('english'):\n",
" filtered_sentence.append(w)\n",
" \n",
" return filtered_sentence\n",
"\n",
"remove_stopwords(sentence2)"
]
},
{
@@ -218,7 +354,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.8.5"
}
},
"nbformat": 4,