ta-data-lis · Bruno2391 · Feb 26, 2024
diff --git a/your-code/challenge-1.ipynb → your-code/[challenge-1] Bruno.ipynb b/your-code/challenge-1.ipynb → your-code/[challenge-1] Bruno.ipynb
@@ -66,7 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -82,6 +82,95 @@
     "    \"\"\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import nltk \n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from nltk.corpus import stopwords\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "#from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from nltk.tokenize import word_tokenize \n",
+    "from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     C:\\Users\\btdjf\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to\n",
+      "[nltk_data]     C:\\Users\\btdjf\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt to\n",
+      "[nltk_data]     C:\\Users\\btdjf\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package omw-1.4 to\n",
+      "[nltk_data]     C:\\Users\\btdjf\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package omw-1.4 is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nltk.download('stopwords')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('punkt') \n",
+    "nltk.download('omw-1.4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ironhack s q website is\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "\n",
+    "text = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\"\n",
+    "\n",
+    "def clean_up(text):\n",
+    "    \"\"\"Normalise raw text to lowercase alphabetic words separated by single spaces.\n",
+    "\n",
+    "    URLs are dropped first, then every run of characters other than ASCII\n",
+    "    letters and spaces is replaced by a space, the result is lower-cased and\n",
+    "    whitespace is collapsed.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The string to clean.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned string, without leading or trailing whitespace.\n",
+    "    \"\"\"\n",
+    "    # Remove URLs. The match is case-insensitive so that 'HTTP://...' or\n",
+    "    # 'WWW....' is dropped too instead of leaking its fragments as words.\n",
+    "    text = re.sub(r\"http\\S+|www\\S+\", \"\", text, flags=re.IGNORECASE)\n",
+    "    # Replace every run of characters other than ASCII letters and spaces\n",
+    "    text = re.sub(r\"[^a-zA-Z ]+\", \" \", text)\n",
+    "    # Convert to lowercase\n",
+    "    text = text.lower()\n",
+    "    # Collapse the repeated whitespace left behind by the substitutions above\n",
+    "    text = re.sub(r\"\\s+\", \" \", text)\n",
+    "    return text.strip()\n",
+    "\n",
+    "print(clean_up(text))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -101,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -117,6 +206,28 @@
     "    \"\"\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 's', 'q', 'website', 'is']\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = \"ironhack s q website is\"\n",
+    "\n",
+    "def tokenize(text):\n",
+    "    \"\"\"Split a string into tokens by delegating to NLTK's word_tokenize.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The string to tokenize.\n",
+    "\n",
+    "    Returns:\n",
+    "        Whatever word_tokenize returns for the given text.\n",
+    "    \"\"\"\n",
+    "    tokens = word_tokenize(text)\n",
+    "    return tokens\n",
+    "\n",
+    "print(tokenize(text))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -145,7 +256,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 67,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -161,6 +272,37 @@
     "    \"\"\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 's', 'q', 'websit', 'is']\n"
+     ]
+    }
+   ],
+   "source": [
+    "words = ['ironhack', 's', 'q', 'website', 'is']\n",
+    "\n",
+    "def stem_and_lemmatize(words):\n",
+    "    \"\"\"Stem every word with PorterStemmer, then lemmatize the stem with WordNetLemmatizer.\n",
+    "\n",
+    "    Args:\n",
+    "        words: An iterable of word strings.\n",
+    "\n",
+    "    Returns:\n",
+    "        A new list holding the processed form of each input word, in input order.\n",
+    "    \"\"\"\n",
+    "    porter = PorterStemmer()\n",
+    "    wordnet = WordNetLemmatizer()\n",
+    "    # Stemming runs first; the lemmatizer is applied to the stem, not the raw word.\n",
+    "    return [wordnet.lemmatize(porter.stem(token)) for token in words]\n",
+    "\n",
+    "print(stem_and_lemmatize(words))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -176,7 +318,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 69,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -192,6 +334,30 @@
     "    \"\"\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 'websit']\n"
+     ]
+    }
+   ],
+   "source": [
+    "words = ['ironhack', 's', 'q', 'websit', 'is']\n",
+    "\n",
+    "def remove_stopwords(words):\n",
+    "    \"\"\"Drop NLTK English stop words and single-character tokens from a sequence of words.\n",
+    "\n",
+    "    Args:\n",
+    "        words: An iterable of word strings.\n",
+    "\n",
+    "    Returns:\n",
+    "        A new list with the surviving words, in their original order.\n",
+    "    \"\"\"\n",
+    "    english_stops = set(stopwords.words('english'))\n",
+    "    kept = []\n",
+    "    for token in words:\n",
+    "        # Skip stop words as well as tokens that are at most one character long.\n",
+    "        if token in english_stops or len(token) <= 1:\n",
+    "            continue\n",
+    "        kept.append(token)\n",
+    "    return kept\n",
+    "\n",
+    "print(remove_stopwords(words))\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -204,7 +370,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -218,7 +384,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,