Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 128 additions & 19 deletions your-code/challenge-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,28 @@
"* Write the functions you will use in Challenge 3 for cleaning, tokenizing, stemming, and lemmatizing data."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"import nltk \n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.tokenize import sent_tokenize\n",
"from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.corpus import wordnet\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -66,20 +88,45 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'ironhack s q website is'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
def clean_up(string):
    """
    Clean numbers, URLs, and special characters out of a string.

    Args:
        string: The string to be cleaned up.

    Returns:
        A lowercase string with URLs, digits, and punctuation replaced by
        spaces, with runs of whitespace collapsed and the ends stripped.
    """
    # BUG FIX: the parameter was renamed to `string` but the body still read
    # the old name `s`, which raises NameError on a fresh kernel run.
    text = string.lower()  # normalize case first so patterns need not be case-aware

    # Remove URLs before digits: stripping digits first would split a URL
    # like 'http://a1.com' in two and leave residue behind.
    text = re.sub(r'http\S+', ' ', text)      # remove URLs
    text = re.sub(r'\d+', ' ', text)          # remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)      # remove special characters (keep word chars and spaces)

    # Collapse repeated spaces introduced by the substitutions above
    text = re.sub(r'\s+', ' ', text).strip()

    return text


string = r"""@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")"""

cleaned_string = clean_up(string)
cleaned_string
]
},
{
Expand All @@ -101,11 +148,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'website', 'is']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
def tokenize(cleaned_string):
    """
    Tokenize a string.

    Args:
        cleaned_string: The string to tokenize.

    Returns:
        A list of purely alphanumeric word tokens.
    """
    # NLTK's tokenizer emits punctuation as separate tokens; keep only
    # alphanumeric ones so stray symbols are dropped from the result.
    return [token for token in word_tokenize(cleaned_string) if token.isalnum()]


string_tokens = tokenize(cleaned_string)
string_tokens
]
},
{
Expand Down Expand Up @@ -145,11 +210,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['ironhack---->ironhack',\n",
" 's---->s',\n",
" 'q---->q',\n",
" 'website---->website',\n",
" 'is---->is']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
def stem_and_lemmatize(string_tokens):
    """
    Lemmatize a list of words.

    Args:
        string_tokens: A list of word tokens.

    Returns:
        A list of the lemmatized words.
    """
    lemmatizer = WordNetLemmatizer()
    # BUG FIX: the previous version returned "word---->lemma" debug strings.
    # Those never match entries in the stopword list, so remove_stopwords
    # downstream became a no-op (e.g. 'is---->is' survived stopword removal).
    # Return the processed words themselves instead.
    # NOTE(review): despite the name, no stemmer is applied here; add a
    # PorterStemmer pass if the exercise requires actual stemming.
    return [lemmatizer.lemmatize(word) for word in string_tokens]


lemmatized = stem_and_lemmatize(string_tokens)
lemmatized
]
},
{
Expand All @@ -176,11 +263,26 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['ironhack---->ironhack',\n",
" 's---->s',\n",
" 'q---->q',\n",
" 'website---->website',\n",
" 'is---->is']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
def remove_stopwords(lemmatized):
    """
    Remove English stopwords from a list of strings.

    Args:
        lemmatized: A list of word tokens.

    Returns:
        The input tokens with English stopwords filtered out.
    """
    # Build the stopword set once; set membership keeps the filter O(1) per token.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in lemmatized if token not in english_stopwords]


filtered_tokens = remove_stopwords(lemmatized)
filtered_tokens
]
},
{
Expand All @@ -204,7 +313,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -218,7 +327,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
Loading