158 changes: 144 additions & 14 deletions your-code/challenge-1.ipynb
@@ -66,9 +66,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'ironhack s q website is'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
@@ -79,7 +90,19 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
" \"\"\"\n",
" import re\n",
" \n",
" # remove URLs\n",
" s = re.sub(r'http.+', '', s)\n",
" \n",
" # remove special characters\n",
" s = re.sub(r'[^a-zA-Z\\s]+', ' ', s)\n",
" \n",
" return \" \".join([word for word in s.lower().split()])\n",
" \n",
"\n",
"clean_up(\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\")"
]
},
{
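Reviewer note on `clean_up`: the pattern `r'http.+'` is greedy, so it deletes everything from the first `http` to the end of the string; in the test call above that is also what removes the trailing `[(2018)]\")`. If only the URL itself should be dropped, a narrower pattern stops at the next whitespace. A minimal sketch of the difference (a suggestion, not part of this PR):

```python
import re

# The diff's pattern removes the URL *and* everything after it;
# r'http\S+' (suggested alternative) stops at the first whitespace.
s = "check http://ironhack.com today"
print(re.sub(r'http.+', '', s))   # 'check '
print(re.sub(r'http\S+', '', s))  # 'check  today'
```

For the notebook's own test string both variants arrive at 'ironhack s q website is', because the special-character pass removes `[(2018)]` anyway.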
@@ -101,10 +124,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] /Users/rickardramhoj/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'website', 'is']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import nltk and tokenizer\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"nltk.download(\"punkt\")\n",
"\n",
"def tokenize(s):\n",
" \"\"\"\n",
" Tokenize a string.\n",
@@ -114,7 +162,12 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" return [word.lower() for word in word_tokenize(s) if word.isalpha()]\n",
"\n",
"# test it\n",
"tokenize(\"ironhack s q website is\")"
]
},
{
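Reviewer note on `tokenize`: `word_tokenize` needs the punkt model downloaded above, and the `isalpha()` filter drops any token containing digits or punctuation. Since `clean_up` already lowercases and strips non-letters, a plain `str.split` would yield the same tokens at this point in the pipeline; `word_tokenize` only pays off on raw text. A small sketch, assuming punkt is installed:

```python
from nltk.tokenize import word_tokenize

s = "ironhack s q website is"
# On text already normalized by clean_up, both approaches agree;
# word_tokenize differs on raw text, e.g. "don't" -> ['do', "n't"].
assert [w.lower() for w in word_tokenize(s) if w.isalpha()] == s.split()
```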
@@ -145,9 +198,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /Users/rickardramhoj/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'websit', 'is']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def stem_and_lemmatize(l):\n",
" \"\"\"\n",
@@ -158,7 +231,33 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" # import libraries\n",
" from nltk.stem import PorterStemmer\n",
" nltk.download(\"wordnet\")\n",
" from nltk.stem import WordNetLemmatizer\n",
" \n",
" #initiate stemmer \n",
" stemmer = PorterStemmer()\n",
" \n",
" # initiate lemmatizer\n",
" lemmatizer = WordNetLemmatizer()\n",
" \n",
" # define list\n",
" new_list = []\n",
" \n",
" # loop words and append lemmatized and stemmed\n",
" for word in l:\n",
" lemmatized = lemmatizer.lemmatize(word)\n",
" new_list.append(stemmer.stem(lemmatized))\n",
" #new_list.append({\"stemmed\": stemmer.stem(word), \"lemmatized\": lemmatizer.lemmatize(word)})\n",
" \n",
" return new_list\n",
" \n",
"# test it\n",
"stem_and_lemmatize(['ironhack', 's', 'q', 'website', 'is'])\n",
" "
]
},
{
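Reviewer note on `stem_and_lemmatize`: `WordNetLemmatizer.lemmatize` defaults to the noun part of speech, so a verb like "is" passes through unchanged, while `PorterStemmer` clips "website" to "websit"; both effects are visible in the output above. A short sketch of the POS effect (assumes the wordnet corpus is already downloaded, as in the cell above):

```python
from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

print(lemmatizer.lemmatize("is"))           # 'is'  -- treated as a noun by default
print(lemmatizer.lemmatize("is", pos="v"))  # 'be'  -- lemmatized as a verb
print(stemmer.stem("website"))              # 'websit' -- matches the output above
```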
@@ -176,9 +275,29 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/rickardramhoj/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"['ironhack', 'q', 'websit']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def remove_stopwords(l):\n",
" \"\"\"\n",
@@ -189,7 +308,18 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" # import stopwords\n",
" nltk.download(\"stopwords\")\n",
" from nltk.corpus import stopwords\n",
" \n",
" return [word for word in l if word not in stopwords.words()]\n",
"\n",
"# test it\n",
"remove_stopwords(['ironhack', 's', 'q', 'websit', 'is'])\n",
" \n",
" "
]
},
{
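Reviewer note on `remove_stopwords`: called with no argument, `stopwords.words()` returns the concatenated stop lists for every language NLTK ships, and the expression is re-evaluated for each word in the comprehension. Restricting to English and building a `set` once makes each membership test O(1). A sketch of the usual pattern (a suggestion, not part of this PR; assumes the stopwords corpus is downloaded):

```python
from nltk.corpus import stopwords

# Build the English stop list once as a set instead of rebuilding
# the full multilingual list for every word.
english_stops = set(stopwords.words("english"))
words = ['ironhack', 's', 'q', 'websit', 'is']
print([w for w in words if w not in english_stops])  # ['ironhack', 'q', 'websit']
```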
@@ -204,7 +334,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -218,7 +348,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.6"
}
},
"nbformat": 4,
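Taken together, the four functions added in this PR form one preprocessing pipeline. A minimal end-to-end sketch, chaining them on the notebook's own test tweet (the commented values are the cell outputs recorded in the diff):

```python
# Chain the four helpers defined in the notebook, assuming the cells
# above have been run so the functions and NLTK data are available.
raw = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")"
cleaned = clean_up(raw)             # 'ironhack s q website is'
tokens = tokenize(cleaned)          # ['ironhack', 's', 'q', 'website', 'is']
stems = stem_and_lemmatize(tokens)  # ['ironhack', 's', 'q', 'websit', 'is']
final = remove_stopwords(stems)     # ['ironhack', 'q', 'websit']
print(final)
```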