diff --git a/Final.ipynb b/Final.ipynb index a53f418..36b5068 100644 --- a/Final.ipynb +++ b/Final.ipynb @@ -1,991 +1,1345 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "#INCLUDE LIBRARIES\n", - "\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import re\n", - "import itertools\n", - "import nltk\n", - "from nltk.corpus import stopwords\n", - "from nltk.stem import WordNetLemmatizer\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "from nltk import sent_tokenize, word_tokenize\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Hello', 'everyone', '.', 'You', 'are', 'reading', 'NLP', 'article', '.']" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + }, + "colab": { + "name": "Final.ipynb.txt", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true } - ], - "source": [ - "from nltk.tokenize import word_tokenize\n", - "text = \"Hello everyone. You are reading NLP article.\"\n", - "word_tokenize(text)" - ] }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0titletextlabel
08476You Can Smell Hillary’s FearDaniel Greenfield, a Shillman Journalism Fello...FAKE
110294Watch The Exact Moment Paul Ryan Committed Pol...Google Pinterest Digg Linkedin Reddit Stumbleu...FAKE
23608Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Mon...REAL
310142Bernie supporters on Twitter erupt in anger ag...— Kaydee King (@KaydeeKing) November 9, 2016 T...FAKE
4875The Battle of New York: Why This Primary MattersIt's primary day in New York and front-runners...REAL
56903Tehran, USA\\nI’m not an immigrant, but my grandparents ...FAKE
67341Girl Horrified At What She Watches Boyfriend D...Share This Baylee Luciani (left), Screenshot o...FAKE
795‘Britain’s Schindler’ Dies at 106A Czech stockbroker who saved more than 650 Je...REAL
84869Fact check: Trump and Clinton at the 'commande...Hillary Clinton and Donald Trump made some ina...REAL
92909Iran reportedly makes new push for uranium con...Iranian negotiators reportedly have made a las...REAL
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 title \\\n", - "0 8476 You Can Smell Hillary’s Fear \n", - "1 10294 Watch The Exact Moment Paul Ryan Committed Pol... \n", - "2 3608 Kerry to go to Paris in gesture of sympathy \n", - "3 10142 Bernie supporters on Twitter erupt in anger ag... \n", - "4 875 The Battle of New York: Why This Primary Matters \n", - "5 6903 Tehran, USA \n", - "6 7341 Girl Horrified At What She Watches Boyfriend D... \n", - "7 95 ‘Britain’s Schindler’ Dies at 106 \n", - "8 4869 Fact check: Trump and Clinton at the 'commande... \n", - "9 2909 Iran reportedly makes new push for uranium con... \n", - "\n", - " text label \n", - "0 Daniel Greenfield, a Shillman Journalism Fello... FAKE \n", - "1 Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE \n", - "2 U.S. Secretary of State John F. Kerry said Mon... REAL \n", - "3 — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE \n", - "4 It's primary day in New York and front-runners... REAL \n", - "5 \\nI’m not an immigrant, but my grandparents ... FAKE \n", - "6 Share This Baylee Luciani (left), Screenshot o... FAKE \n", - "7 A Czech stockbroker who saved more than 650 Je... REAL \n", - "8 Hillary Clinton and Donald Trump made some ina... REAL \n", - "9 Iranian negotiators reportedly have made a las... REAL " + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Reading the data\n", - "df=pd.read_csv('/home/femme_js/Hoaxify/news.csv')\n", - "\n", - "df.head(10)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Unnamed: 0 title text label\n", - "0 False False False False\n", - "1 False False False False\n", - "2 False False False False\n", - "3 False False False False\n", - "4 False False False False\n", - "... ... ... ... ...\n", - "6330 False False False False\n", - "6331 False False False False\n", - "6332 False False False False\n", - "6333 False False False False\n", - "6334 False False False False\n", - "\n", - "[6335 rows x 4 columns]\n" - ] - } - ], - "source": [ - "# checking if column have nan values\n", - "\n", - "check_nan_in_df = df.isnull()\n", - "print (check_nan_in_df)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "# as data dont have any NaN value, we dont need to fill them" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 FAKE\n", - "1 FAKE\n", - "2 REAL\n", - "3 FAKE\n", - "4 REAL\n", - "Name: label, dtype: object" + }, + { + "cell_type": "code", + "metadata": { + "id": "MdAVzqfOwGmS" + }, + "source": [ + "#INCLUDE LIBRARIES\n", + "\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import itertools\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "from nltk import sent_tokenize, word_tokenize\n", + "\n" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nnRJ1ZhtwGma", + "outputId": "7455aa5e-39f4-4143-e8d7-795b3a586630" + }, + "source": [ + "from nltk.tokenize import word_tokenize\n", + "text = \"Hello everyone. You are reading NLP article.\"\n", + "word_tokenize(text)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['Hello', 'everyone', '.', 'You', 'are', 'reading', 'NLP', 'article', '.']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 54 + } ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Getting the Labels\n", - "\n", - "labels=df.label\n", - "labels.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0titletextlabeltotal
08476You Can Smell Hillary’s FearDaniel Greenfield, a Shillman Journalism Fello...FAKEYou Can Smell Hillary’s Fear Daniel Greenfield...
110294Watch The Exact Moment Paul Ryan Committed Pol...Google Pinterest Digg Linkedin Reddit Stumbleu...FAKEWatch The Exact Moment Paul Ryan Committed Pol...
23608Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Mon...REALKerry to go to Paris in gesture of sympathy U....
310142Bernie supporters on Twitter erupt in anger ag...— Kaydee King (@KaydeeKing) November 9, 2016 T...FAKEBernie supporters on Twitter erupt in anger ag...
4875The Battle of New York: Why This Primary MattersIt's primary day in New York and front-runners...REALThe Battle of New York: Why This Primary Matte...
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 title \\\n", - "0 8476 You Can Smell Hillary’s Fear \n", - "1 10294 Watch The Exact Moment Paul Ryan Committed Pol... \n", - "2 3608 Kerry to go to Paris in gesture of sympathy \n", - "3 10142 Bernie supporters on Twitter erupt in anger ag... \n", - "4 875 The Battle of New York: Why This Primary Matters \n", - "\n", - " text label \\\n", - "0 Daniel Greenfield, a Shillman Journalism Fello... FAKE \n", - "1 Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE \n", - "2 U.S. Secretary of State John F. Kerry said Mon... REAL \n", - "3 — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE \n", - "4 It's primary day in New York and front-runners... REAL \n", - "\n", - " total \n", - "0 You Can Smell Hillary’s Fear Daniel Greenfield... \n", - "1 Watch The Exact Moment Paul Ryan Committed Pol... \n", - "2 Kerry to go to Paris in gesture of sympathy U.... \n", - "3 Bernie supporters on Twitter erupt in anger ag... \n", - "4 The Battle of New York: Why This Primary Matte... " + }, + { + "cell_type": "code", + "metadata": { + "id": "9fVJMZXJwGmb", + "outputId": "fc2748d4-7bf4-4ad1-95cc-e459a9a071bc" + }, + "source": [ + "#Reading the data\n", + "df=pd.read_csv('/home/femme_js/Hoaxify/news.csv')\n", + "\n", + "df.head(10)\n", + "\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0titletextlabel
08476You Can Smell Hillary’s FearDaniel Greenfield, a Shillman Journalism Fello...FAKE
110294Watch The Exact Moment Paul Ryan Committed Pol...Google Pinterest Digg Linkedin Reddit Stumbleu...FAKE
23608Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Mon...REAL
310142Bernie supporters on Twitter erupt in anger ag...— Kaydee King (@KaydeeKing) November 9, 2016 T...FAKE
4875The Battle of New York: Why This Primary MattersIt's primary day in New York and front-runners...REAL
56903Tehran, USA\\nI’m not an immigrant, but my grandparents ...FAKE
67341Girl Horrified At What She Watches Boyfriend D...Share This Baylee Luciani (left), Screenshot o...FAKE
795‘Britain’s Schindler’ Dies at 106A Czech stockbroker who saved more than 650 Je...REAL
84869Fact check: Trump and Clinton at the 'commande...Hillary Clinton and Donald Trump made some ina...REAL
92909Iran reportedly makes new push for uranium con...Iranian negotiators reportedly have made a las...REAL
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 title \\\n", + "0 8476 You Can Smell Hillary’s Fear \n", + "1 10294 Watch The Exact Moment Paul Ryan Committed Pol... \n", + "2 3608 Kerry to go to Paris in gesture of sympathy \n", + "3 10142 Bernie supporters on Twitter erupt in anger ag... \n", + "4 875 The Battle of New York: Why This Primary Matters \n", + "5 6903 Tehran, USA \n", + "6 7341 Girl Horrified At What She Watches Boyfriend D... \n", + "7 95 ‘Britain’s Schindler’ Dies at 106 \n", + "8 4869 Fact check: Trump and Clinton at the 'commande... \n", + "9 2909 Iran reportedly makes new push for uranium con... \n", + "\n", + " text label \n", + "0 Daniel Greenfield, a Shillman Journalism Fello... FAKE \n", + "1 Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE \n", + "2 U.S. Secretary of State John F. Kerry said Mon... REAL \n", + "3 — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE \n", + "4 It's primary day in New York and front-runners... REAL \n", + "5 \\nI’m not an immigrant, but my grandparents ... FAKE \n", + "6 Share This Baylee Luciani (left), Screenshot o... FAKE \n", + "7 A Czech stockbroker who saved more than 650 Je... REAL \n", + "8 Hillary Clinton and Donald Trump made some ina... REAL \n", + "9 Iranian negotiators reportedly have made a las... REAL " + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 55 + } ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Combining important features into a single feature\n", - "\n", - "df['total'] = df['title'] + ' ' + df['text']\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "#PRE-PROCESSING THE DATA\n" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0titletextlabeltotal
08476You Can Smell Hillary’s FearDaniel Greenfield, a Shillman Journalism Fello...FAKEsmell hillary fear daniel greenfield shillman...
110294Watch The Exact Moment Paul Ryan Committed Pol...Google Pinterest Digg Linkedin Reddit Stumbleu...FAKEwatch exact moment paul ryan committed politi...
23608Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Mon...REALkerry go paris gesture sympathy u secretary s...
310142Bernie supporters on Twitter erupt in anger ag...— Kaydee King (@KaydeeKing) November 9, 2016 T...FAKEbernie supporter twitter erupt anger dnc trie...
4875The Battle of New York: Why This Primary MattersIt's primary day in New York and front-runners...REALbattle new york primary matter primary day ne...
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 title \\\n", - "0 8476 You Can Smell Hillary’s Fear \n", - "1 10294 Watch The Exact Moment Paul Ryan Committed Pol... \n", - "2 3608 Kerry to go to Paris in gesture of sympathy \n", - "3 10142 Bernie supporters on Twitter erupt in anger ag... \n", - "4 875 The Battle of New York: Why This Primary Matters \n", - "\n", - " text label \\\n", - "0 Daniel Greenfield, a Shillman Journalism Fello... FAKE \n", - "1 Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE \n", - "2 U.S. Secretary of State John F. Kerry said Mon... REAL \n", - "3 — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE \n", - "4 It's primary day in New York and front-runners... REAL \n", - "\n", - " total \n", - "0 smell hillary fear daniel greenfield shillman... \n", - "1 watch exact moment paul ryan committed politi... \n", - "2 kerry go paris gesture sympathy u secretary s... \n", - "3 bernie supporter twitter erupt anger dnc trie... \n", - "4 battle new york primary matter primary day ne... " + }, + { + "cell_type": "code", + "metadata": { + "id": "kjysD29EwGmb", + "outputId": "eb6a9053-fe24-479f-b6cb-022e63dc75d8" + }, + "source": [ + "# checking if column have nan values\n", + "\n", + "check_nan_in_df = df.isnull()\n", + "print (check_nan_in_df)\n", + "\n", + "\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " Unnamed: 0 title text label\n", + "0 False False False False\n", + "1 False False False False\n", + "2 False False False False\n", + "3 False False False False\n", + "4 False False False False\n", + "... ... ... ... ...\n", + "6330 False False False False\n", + "6331 False False False False\n", + "6332 False False False False\n", + "6333 False False False False\n", + "6334 False False False False\n", + "\n", + "[6335 rows x 4 columns]\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "stop_words = stopwords.words('english')\n", - "\n", - "lemmatizer = WordNetLemmatizer()\n", - "\n", - "for index, row in df.iterrows():\n", - " filter_sentence = ''\n", - " sentence = row['total']\n", - " # Cleaning the sentence with regex\n", - " sentence = re.sub(r'[^\\w\\s]', '', sentence)\n", - " # Tokenization\n", - " words = nltk.word_tokenize(sentence)\n", - " # Stopwords removal\n", - " words = [w for w in words if not w in stop_words]\n", - " # Lemmatization\n", - " for words in words:\n", - " filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(words)).lower()\n", - " \n", - " df.loc[index, 'total'] = filter_sentence\n", - "\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 smell hillary fear daniel greenfield shillman...\n", - "1 watch exact moment paul ryan committed politi...\n", - "2 kerry go paris gesture sympathy u secretary s...\n", - "3 bernie supporter twitter erupt anger dnc trie...\n", - "4 battle new york primary matter primary day ne...\n", - "Name: total, dtype: object" + }, + { + "cell_type": "code", + "metadata": { + "id": "1ky9amFFwGmc" + }, + "source": [ + "# as data dont have any NaN value, we dont need to fill them" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tsfWPHDAwGmc", + "outputId": "1081082b-ce15-4643-c9db-1e6278f237e5" + }, + "source": [ + "#Getting the Labels\n", + "\n", + "labels=df.label\n", + "labels.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 FAKE\n", + "1 FAKE\n", + "2 REAL\n", + "3 FAKE\n", + "4 REAL\n", + "Name: label, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 58 + } ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['total'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "print(type(df['label']))" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" + }, + { + "cell_type": "code", + "metadata": { + "id": "-X_9viQzwGmd", + "outputId": "51e712a5-4bcb-4420-b28f-29bde46708be" + }, + "source": [ + "# Combining important features into a single feature\n", + "\n", + "df['total'] = df['title'] + ' ' + df['text']\n", + "\n", + "df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0titletextlabeltotal
08476You Can Smell Hillary’s FearDaniel Greenfield, a Shillman Journalism Fello...FAKEYou Can Smell Hillary’s Fear Daniel Greenfield...
110294Watch The Exact Moment Paul Ryan Committed Pol...Google Pinterest Digg Linkedin Reddit Stumbleu...FAKEWatch The Exact Moment Paul Ryan Committed Pol...
23608Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Mon...REALKerry to go to Paris in gesture of sympathy U....
310142Bernie supporters on Twitter erupt in anger ag...— Kaydee King (@KaydeeKing) November 9, 2016 T...FAKEBernie supporters on Twitter erupt in anger ag...
4875The Battle of New York: Why This Primary MattersIt's primary day in New York and front-runners...REALThe Battle of New York: Why This Primary Matte...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 title \\\n", + "0 8476 You Can Smell Hillary’s Fear \n", + "1 10294 Watch The Exact Moment Paul Ryan Committed Pol... \n", + "2 3608 Kerry to go to Paris in gesture of sympathy \n", + "3 10142 Bernie supporters on Twitter erupt in anger ag... \n", + "4 875 The Battle of New York: Why This Primary Matters \n", + "\n", + " text label \\\n", + "0 Daniel Greenfield, a Shillman Journalism Fello... FAKE \n", + "1 Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE \n", + "2 U.S. Secretary of State John F. Kerry said Mon... REAL \n", + "3 — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE \n", + "4 It's primary day in New York and front-runners... REAL \n", + "\n", + " total \n", + "0 You Can Smell Hillary’s Fear Daniel Greenfield... \n", + "1 Watch The Exact Moment Paul Ryan Committed Pol... \n", + "2 Kerry to go to Paris in gesture of sympathy U.... \n", + "3 Bernie supporters on Twitter erupt in anger ag... \n", + "4 The Battle of New York: Why This Primary Matte... " + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 59 + } ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEICAYAAACzliQjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAQ5klEQVR4nO3df6zddX3H8efLAk5RB467hrXFFq1ZcJtAGmRjyVAmFExW0E3pMu0YW01WEs38B40JTiXRZMrioiw4OotzYjc1VNcNK8OpyxQKQ6Qwxh0/RpsC1SLqiGzge3+cT/VQ76+2t+fg/Twfycn9ft/fz/d73t/k9nW//ZzvOSdVhSSpD88adwOSpNEx9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOnLEuBuYyXHHHVfLly8fdxuS9FPllltu+VZVTUy17Rkd+suXL2f79u3jbkOSfqokeWC6bU7vSFJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjryjH5z1k+L5Zf+w7hbWFDuf99rxt2CtGAZ+tIC50XJ/FkIFyRO70hSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOzBr6SX4myU1JvpFkR5I/bfUVSb6eZDLJp5Ic1erPbuuTbfvyoWO9vdXvTnLO4TopSdLU5nKl/wTwqqp6OXAysDrJ6cD7gSuq6iXAo8DFbfzFwKOtfkUbR5KTgAuBlwGrgY8kWTSfJyNJmtmsoV8D32+rR7ZHAa8C/r7VNwHnt+U1bZ22/awkafVrq+qJqroPmAROm5ezkCTNyZzm9JMsSnIb8AiwDfgv4DtV9WQbshNY0paXAA8CtO2PAT83XJ9in+HnWp9ke5Lte/bsOfAzkiRNa06hX1VPVdXJwFIGV+e/eLgaqqqrqmpVVa2amJjyy9wlSQfpgO7eqarvADcCvwock2TfZ/csBXa15V3AMoC2/WeBbw/Xp9hHkjQCc7l7ZyLJMW35OcCrgbsYhP9vt2HrgOva8pa2Ttv+z1VVrX5hu7tnBbASuGm+TkSSNLu5fMrm8cCmdqfNs4DNVfX5JHcC1yZ5L/DvwNVt/NXAx5NMAnsZ3LFDVe1Ishm4E3gS2FBVT83v6UiSZjJr6FfV7cApU9TvZYq7b6rqB8DvTHOsy4HLD7xNSdJ88B25ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHVk1tBPsizJjUnuTLIjyVta/V1JdiW5rT3OG9rn7Ukmk9yd5Jyh+upWm0xy6eE5JUnSdI6Yw5gngbdV1a1Jng/ckmRb23ZFVf3Z8OAkJwEXAi8DfgH4YpKXts0fBl4N7ARuTrKlqu6cjxORJM1u1tCvqt3A7rb8vSR3AUtm2GUNcG1VPQHcl2QSOK1tm6yqewGSXNvGGvqSNCIHNKefZDlwCvD1Vrokye1JNiY5ttWWAA8O7baz1aarS5JGZM6hn+R5wKeBt1bVd4ErgRcDJzP4n8AH5qOhJOuTbE+yfc+ePfNxSElSM6fQT3Ikg8D/RFV9BqCqHq6qp6rqh8BH+fEUzi5g2dDuS1ttuvrTVNVVVbWqqlZNTEwc6PlIkmYwl7t3AlwN3FVVHxyqHz807ALgjra8BbgwybOTrABWAjcBNwMrk6xIchSDF3u3zM9pSJLmYi5375wBvBH4ZpLbWu0dwNokJwMF3A+8GaCqdiTZzOAF2ieBDVX1FECSS4DrgUXAxqraMY/nIkmaxVzu3vkqkCk2bZ1hn8uBy6eob51pP0nS4eU7ciWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR2ZNfSTLEtyY5I7k+xI8pZWf2GSbUnuaT+PbfUk+VCSySS3Jzl16Fjr2vh7kqw7fKclSZrKXK70nwTeVlUnAacDG5KcBFwK3FBVK4Eb2jrAucDK9lgPXAmDPxLAZcArgNOAy/b9oZAkjcasoV9Vu6vq1rb8PeAuYAmwBtjUhm0Czm/La4BrauBrwDFJjgfOAbZV1d6qehTYBqye17ORJM3ogOb0kywHTgG+Diyuqt1t00PA4ra8BHhwaLedrTZdXZI0InMO/STPAz4NvLWqvju8raoKqPloKMn6JNuTbN+zZ898HFKS1Mwp9JMcySDwP1FVn2nlh9u0De3nI62+C1g2tPvSVpuu/jRVdVVVraqqVRMTEwdyLpKkWczl7p0AVwN3VdUHhzZtAfbdgbMOuG6o/qZ2F8/pwGNtGuh64Owkx7YXcM9uNUnSiBwxhzFnAG8EvpnktlZ7B/A+YHOSi4EHgNe3bVuB84BJ4HHgIoCq2pvkPcDNbdy7q2rvvJyFJGlOZg39qvoqkGk2nzXF+AI2THOsjcDGA2lQkjR/fEeuJHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI7OGfpKNSR5JcsdQ7V1JdiW5rT3OG9r29iSTSe5Ocs5QfXWrTSa5dP5PRZI0m7lc6X8MWD1F/YqqOrk9tgIkOQm4EHhZ2+cjSRYlWQR8GDgXOAlY28ZKkkboiNkGVNWXkyyf4/HWANdW1RPAfUkmgdPatsmquhcgybVt7J0H3LEk6aAdypz+JUlub9M/x7baEuDBoTE7W226+k9Isj7J9iTb9+zZcwjtSZL2d7ChfyXwYuBkYDfwgflqqKquqqpVVbVqYmJivg4rSWIO0ztTqaqH9y0n+Sjw+ba6C1g2NHRpqzFDXZI0Igd1pZ/k+KHVC4B9d/ZsAS5M8uwkK4CVwE3AzcDKJCuSHMXgxd4tB9+2JOlgzHqln+STwJnAcUl2ApcBZyY5GSjgfuDNAFW1I8lmBi/QPglsqKqn2nEuAa4HFgEbq2rHvJ+NJGlGc7l7Z+0U5atnGH85cPkU9a3A1gPqTpI0r3xHriR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdmTX0k2xM8kiSO4ZqL0yyLck97eexrZ4kH0oymeT2JKcO7bOujb8nybrDczqSpJnM5Ur/Y8Dq/WqXAjdU1UrghrYOcC6wsj3WA1fC4I8EcBnwCuA04LJ9fygkSaMza+hX1ZeBvfuV1wCb2vIm4Pyh+jU18DXgmCTHA+cA26pqb1U9CmzjJ/+QSJIOs4Od019cVbvb8kPA4ra8BHhwaNzOVpuuLkkaoUN+IbeqCqh56AWAJOuTbE+yfc+ePfN1WEkSBx/6D7dpG9rPR1p9F7BsaNzSVpuu/hOq6qqqWlVVqyYmJg6yPUnSVA429LcA++7AWQdcN1R/U7uL53TgsTYNdD1wdpJj2wu4Z7eaJGmEjphtQJJPAmcCxyXZyeAunPcBm5NcDDwAvL4N3wqcB0wCjwMXAVTV3iTvAW5u495dVfu/OCxJOsxmDf2qWjvNprOmGFvAhmmOsxHYeEDdSZLmle/IlaSOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdeSQQj/J/Um+meS2JNtb7YVJtiW5p/08ttWT5ENJJpPcnuTU+TgBSdLczceV/iur6uSqWtXWLwVuqKqVwA1tHeBcYGV7rAeunIfnliQdgMMxvbMG2NSWNwHnD9WvqYGvAcckOf4wPL8kaRqHGvoFfCHJLUnWt9riqtrdlh8CFrflJcCDQ/vubDVJ0ogccYj7/3pV7Ury88C2JP8xvLGqKkkdyAHbH4/1ACeccMIhtidJGnZIV/pVtav9fAT4LHAa8PC+aZv285E2fBewbGj3pa22/zGvqqpVVbVqYmLiUNqTJO3noEM/ydFJnr9vGTgbuAPYAqxrw9YB17XlLcCb2l08pwOPDU0DSZJG4FCmdxYDn02y7zh/W1X/lORmYHOSi4EHgNe38VuB84BJ4HHgokN4bknSQTjo0K+qe4GXT1H/NnDWFPUCNhzs80mSDp3vyJWkjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHVk5KGfZHWSu5NMJrl01M8vST0baegnWQR8GDgXOAlYm+SkUfYgST0b9ZX+acBkVd1bVf8LXAusGXEPktStI0b8fEuAB4fWdwKvGB6QZD2wvq1+P8ndI+qtB8cB3xp3E7PJ+8fdgcbkGf/7+VP0u/mi6TaMOvRnVVVXAVeNu4+FKMn2qlo17j6kqfj7ORqjnt7ZBSwbWl/aapKkERh16N8MrEyyIslRwIXAlhH3IEndGun0TlU9meQS4HpgEbCxqnaMsofOOW2mZzJ/P0cgVTXuHiRJI+I7ciWpI4a+JHXE0Jekjhj6HUry1nH3IGk8DP0+/cm4G1DfkmweWn7/ftu+MPqO+mHo9ynjbkDdWzm0/Or9tk2MspHeGPp98j5djdtMv4P+fh5Gz7jP3tH8SPI9pv7HE+C5I25H2t9zk5zC4MLzOW057fGcsXa2wPnmLEkjl+RLzHBFX1WvHF03fTH0O5LkaOACYG1VvWbc/ahfSY6sqv+bZtuKqrpv1D31wjn9BS7JUUkuSPJ3wG7gLOAvx9yWdF370MWnSfIrwI1j6Kcbhv4CleTsJH8N3Ae8DrgG2FtVF1XV58bbncStwD8m+dHrS0nOBLYCfzSupnrg9M4CleSHwFeA39/3X+Uk91bViePtTBpI8k7gHAbfmX028OfAa6tq+1gbW+C8e2fhOpXB9xV8Mcm9DL6PeNF4W5J+rKrem+Rx4BYGd+28qqomx9zWgueVfgeS/BqwlsE0zzeAz7avpZTGIsnnGNy9E+AMYBJ4aN/2qvqtMbW24Bn6HUnyLOA3gTdU1cXj7kf9SvIbM22vqn8ZVS+9cXpngUrye1X1N235jKr616r6IfCFJC8dc3vq3HShnmQZg2lJQ/8w8e6dhWv4Q9X+Yr9tfzDKRqSZJJlI8sdJvgJ8CVg85pYWNK/0F65MszzVujRSSZ4PvBb4XeClwGeAFVW1dKyNdcDQX7hqmuWp1qVRewS4CXgn8NWqqiQXjLmnLvhC7gLVboWbZHBV/+K2TFs/saqOHldvUvsinwuBo4FPAp8Ctvk+ksPP0F+gkrxopu1V9cCoepGmk+REBuG/lsFn7F/G4Jbi/xxrYwuYod+Zdtvm2qr6xLh7Ub+SnFBV/71f7ZcYhP8bquol4+ls4TP0F6gkLwA2AEuALcA24BLgbcA3qmrNGNtT55LcWlWntuVPV9Xrxt1TL3whd+H6OPAo8G/AHwLvYDCff35V3TbOxiSefgeZ8/gjZOgvXCdW1S8DJPkrBh+rfEJV/WC8bUnAzHeX6TAy9BeuH31BRVU9lWSnga9nkJcn+S7t6xHbMm29quoF42ttYXNOf4FK8hTwP/tWGXzv6OP4j0rqmqEvSR3xs3ckqSOGviR1xNCXpI4Y+pLUEUNfkjry/9XeZk0OCEF+AAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" + "cell_type": "code", + "metadata": { + "id": "13Aex96vwGmd" + }, + "source": [ + "#PRE-PROCESSING THE DATA\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "91U0zkFhwGme", + "outputId": "92f54a8f-770c-42a7-9503-d5ffd46805c3" + }, + "source": [ + "\n", + "stop_words = stopwords.words('english')\n", + "\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "for index, row in df.iterrows():\n", + " filter_sentence = ''\n", + " sentence = row['total']\n", + " # Cleaning the sentence with regex\n", + " sentence = re.sub(r'[^\\w\\s]', '', sentence)\n", + " # Tokenization\n", + " words = nltk.word_tokenize(sentence)\n", + " # Stopwords removal\n", + " words = [w for w in words if not w in stop_words]\n", + " # Lemmatization\n", + " for words in words:\n", + " filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(words)).lower()\n", + " \n", + " df.loc[index, 'total'] = filter_sentence\n", + "\n", + "df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0titletextlabeltotal
08476You Can Smell Hillary’s FearDaniel Greenfield, a Shillman Journalism Fello...FAKEsmell hillary fear daniel greenfield shillman...
110294Watch The Exact Moment Paul Ryan Committed Pol...Google Pinterest Digg Linkedin Reddit Stumbleu...FAKEwatch exact moment paul ryan committed politi...
23608Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Mon...REALkerry go paris gesture sympathy u secretary s...
310142Bernie supporters on Twitter erupt in anger ag...— Kaydee King (@KaydeeKing) November 9, 2016 T...FAKEbernie supporter twitter erupt anger dnc trie...
4875The Battle of New York: Why This Primary MattersIt's primary day in New York and front-runners...REALbattle new york primary matter primary day ne...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 title \\\n", + "0 8476 You Can Smell Hillary’s Fear \n", + "1 10294 Watch The Exact Moment Paul Ryan Committed Pol... \n", + "2 3608 Kerry to go to Paris in gesture of sympathy \n", + "3 10142 Bernie supporters on Twitter erupt in anger ag... \n", + "4 875 The Battle of New York: Why This Primary Matters \n", + "\n", + " text label \\\n", + "0 Daniel Greenfield, a Shillman Journalism Fello... FAKE \n", + "1 Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE \n", + "2 U.S. Secretary of State John F. Kerry said Mon... REAL \n", + "3 — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE \n", + "4 It's primary day in New York and front-runners... REAL \n", + "\n", + " total \n", + "0 smell hillary fear daniel greenfield shillman... \n", + "1 watch exact moment paul ryan committed politi... \n", + "2 kerry go paris gesture sympathy u secretary s... \n", + "3 bernie supporter twitter erupt anger dnc trie... \n", + "4 battle new york primary matter primary day ne... " + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 62 + } ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "df['label'].value_counts().plot(kind = 'bar')" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['FAKE', 'REAL'], dtype=object)" + }, + { + "cell_type": "code", + "metadata": { + "id": "Rz6usQf9wGme", + "outputId": "f33bf608-11f4-41a7-95c6-be05c5129060" + }, + "source": [ + "df['total'].head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 smell hillary fear daniel greenfield shillman...\n", + "1 watch exact moment paul ryan committed politi...\n", + "2 kerry go paris gesture sympathy u secretary s...\n", + "3 bernie supporter twitter erupt anger dnc trie...\n", + "4 battle new york primary matter primary day ne...\n", + "Name: total, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 63 + } ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.label = df.label.astype(str)\n", - "df.label.unique()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 0\n", - "2 1\n", - "3 0\n", - "4 1\n", - "Name: label, dtype: object" + }, + { + "cell_type": "code", + "metadata": { + "id": "M1nwNConwGmf", + "outputId": "4b6c4a29-7190-4b58-b1c9-64b954387a6f" + }, + "source": [ + "print(type(df['label']))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.label = df.label.astype(str)\n", - "df.label = df.label.str.strip()\n", - "\n", - "\n", - "dict = { 'REAL' : '1' , 'FAKE' : '0'}\n", - "\n", - "df['label'] = df['label'].map(dict)\n", - "\n", - "df['label'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "x_df = df['total']\n", - "y_df = df['label']" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 smell hillary fear daniel greenfield shillman...\n", - "1 watch exact moment paul ryan committed politi...\n", - "2 kerry go paris gesture sympathy u secretary s...\n", - "3 bernie supporter twitter erupt anger dnc trie...\n", - "4 battle new york primary matter primary day ne...\n", - "Name: total, dtype: object" + }, + { + "cell_type": "code", + "metadata": { + "id": "2Ox8Ef7vwGmf", + "outputId": "27f32879-7757-454f-ca6a-d74fe970ce16" + }, + "source": [ + "df['label'].value_counts().plot(kind = 'bar')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 65 + }, + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEICAYAAACzliQjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAQ5klEQVR4nO3df6zddX3H8efLAk5RB467hrXFFq1ZcJtAGmRjyVAmFExW0E3pMu0YW01WEs38B40JTiXRZMrioiw4OotzYjc1VNcNK8OpyxQKQ6Qwxh0/RpsC1SLqiGzge3+cT/VQ76+2t+fg/Twfycn9ft/fz/d73t/k9nW//ZzvOSdVhSSpD88adwOSpNEx9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOnLEuBuYyXHHHVfLly8fdxuS9FPllltu+VZVTUy17Rkd+suXL2f79u3jbkOSfqokeWC6bU7vSFJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjryjH5z1k+L5Zf+w7hbWFDuf99rxt2CtGAZ+tIC50XJ/FkIFyRO70hSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOzBr6SX4myU1JvpFkR5I/bfUVSb6eZDLJp5Ic1erPbuuTbfvyoWO9vdXvTnLO4TopSdLU5nKl/wTwqqp6OXAysDrJ6cD7gSuq6iXAo8DFbfzFwKOtfkUbR5KTgAuBlwGrgY8kWTSfJyNJmtmsoV8D32+rR7ZHAa8C/r7VNwHnt+U1bZ22/awkafVrq+qJqroPmAROm5ezkCTNyZzm9JMsSnIb8AiwDfgv4DtV9WQbshNY0paXAA8CtO2PAT83XJ9in+HnWp9ke5Lte/bsOfAzkiRNa06hX1VPVdXJwFIGV+e/eLgaqqqrqmpVVa2amJjyy9wlSQfpgO7eqarvADcCvwock2TfZ/csBXa15V3AMoC2/WeBbw/Xp9hHkjQCc7l7ZyLJMW35OcCrgbsYhP9vt2HrgOva8pa2Ttv+z1VVrX5hu7tnBbASuGm+TkSSNLu5fMrm8cCmdqfNs4DNVfX5JHcC1yZ5L/DvwNVt/NXAx5NMAnsZ3LFDVe1Ishm4E3gS2FBVT83v6UiSZjJr6FfV7cApU9TvZYq7b6rqB8DvTHOsy4HLD7xNSdJ88B25ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHVk1tBPsizJjUnuTLIjyVta/V1JdiW5rT3OG9rn7Ukmk9yd5Jyh+upWm0xy6eE5JUnSdI6Yw5gngbdV1a1Jng/ckmRb23ZFVf3Z8OAkJwEXAi8DfgH4YpKXts0fBl4N7ARuTrKlqu6cjxORJM1u1tCvqt3A7rb8vSR3AUtm2GUNcG1VPQHcl2QSOK1tm6yqewGSXNvGGvqSNCIHNKefZDlwCvD1Vrokye1JNiY5ttWWAA8O7baz1aarS5JGZM6hn+R5wKeBt1bVd4ErgRcDJzP4n8AH5qOhJOuTbE+yfc+ePfNxSElSM6fQT3Ikg8D/RFV9BqCqHq6qp6rqh8BH+fEUzi5g2dDuS1ttuvrTVNVVVbWqqlZNTEwc6PlIkmYwl7t3AlwN3FVVHxyqHz807ALgjra8BbgwybOTrABWAjcBNwMrk6xIchSDF3u3zM9pSJLmYi5375wBvBH4ZpLbWu0dwNokJwMF3A+8GaCqdiTZzOAF2ieBDVX1FECSS4DrgUXAxqraMY/nIkmaxVzu3vkqkCk2bZ1hn8uBy6eob51pP0nS4eU7ciWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR2ZNfSTLEtyY5I7k+xI8pZWf2GSbUnuaT+PbfUk+VCSySS3Jzl16Fjr2vh7kqw7fKclSZrKXK70nwTeVlUnAacDG5KcBFwK3FBVK4Eb2jrAucDK9lgPXAmDPxLAZcArgNOAy/b9oZAkjcasoV9Vu6vq1rb8PeAuYAmwBtjUhm0Czm/La4BrauBrwDFJjgfOAbZV1d6qehTYBqye17ORJM3ogOb0kywHTgG+Diyuqt1t00PA4ra8BHhwaLedrTZdXZI0InMO/STPAz4NvLWqvju8raoKqPloKMn6JNuTbN+zZ898HFKS1Mwp9JMcySDwP1FVn2nlh9u0De3nI62+C1g2tPvSVpuu/jRVdVVVraqqVRMTEwdyLpKkWczl7p0AVwN3VdUHhzZtAfbdgbMOuG6o/qZ2F8/pwGNtGuh64Owkx7YXcM9uNUnSiBwxhzFnAG8EvpnktlZ7B/A+YHOSi4EHgNe3bVuB84BJ4HHgIoCq2pvkPcDNbdy7q2rvvJyFJGlOZg39qvoqkGk2nzXF+AI2THOsjcDGA2lQkjR/fEeuJHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI7OGfpKNSR5JcsdQ7V1JdiW5rT3OG9r29iSTSe5Ocs5QfXWrTSa5dP5PRZI0m7lc6X8MWD1F/YqqOrk9tgIkOQm4EHhZ2+cjSRYlWQR8GDgXOAlY28ZKkkboiNkGVNWXkyyf4/HWANdW1RPAfUkmgdPatsmquhcgybVt7J0H3LEk6aAdypz+JUlub9M/x7baEuDBoTE7W226+k9Isj7J9iTb9+zZcwjtSZL2d7ChfyXwYuBkYDfwgflqqKquqqpVVbVqYmJivg4rSWIO0ztTqaqH9y0n+Sjw+ba6C1g2NHRpqzFDXZI0Igd1pZ/k+KHVC4B9d/ZsAS5M8uwkK4CVwE3AzcDKJCuSHMXgxd4tB9+2JOlgzHqln+STwJnAcUl2ApcBZyY5GSjgfuDNAFW1I8lmBi/QPglsqKqn2nEuAa4HFgEbq2rHvJ+NJGlGc7l7Z+0U5atnGH85cPkU9a3A1gPqTpI0r3xHriR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdmTX0k2xM8kiSO4ZqL0yyLck97eexrZ4kH0oymeT2JKcO7bOujb8nybrDczqSpJnM5Ur/Y8Dq/WqXAjdU1UrghrYOcC6wsj3WA1fC4I8EcBnwCuA04LJ9fygkSaMza+hX1ZeBvfuV1wCb2vIm4Pyh+jU18DXgmCTHA+cA26pqb1U9CmzjJ/+QSJIOs4Od019cVbvb8kPA4ra8BHhwaNzOVpuuLkkaoUN+IbeqCqh56AWAJOuTbE+yfc+ePfN1WEkSBx/6D7dpG9rPR1p9F7BsaNzSVpuu/hOq6qqqWlVVqyYmJg6yPUnSVA429LcA++7AWQdcN1R/U7uL53TgsTYNdD1wdpJj2wu4Z7eaJGmEjphtQJJPAmcCxyXZyeAunPcBm5NcDDwAvL4N3wqcB0wCjwMXAVTV3iTvAW5u495dVfu/OCxJOsxmDf2qWjvNprOmGFvAhmmOsxHYeEDdSZLmle/IlaSOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdeSQQj/J/Um+meS2JNtb7YVJtiW5p/08ttWT5ENJJpPcnuTU+TgBSdLczceV/iur6uSqWtXWLwVuqKqVwA1tHeBcYGV7rAeunIfnliQdgMMxvbMG2NSWNwHnD9WvqYGvAcckOf4wPL8kaRqHGvoFfCHJLUnWt9riqtrdlh8CFrflJcCDQ/vubDVJ0ogccYj7/3pV7Ury88C2JP8xvLGqKkkdyAHbH4/1ACeccMIhtidJGnZIV/pVtav9fAT4LHAa8PC+aZv285E2fBewbGj3pa22/zGvqqpVVbVqYmLiUNqTJO3noEM/ydFJnr9vGTgbuAPYAqxrw9YB17XlLcCb2l08pwOPDU0DSZJG4FCmdxYDn02y7zh/W1X/lORmYHOSi4EHgNe38VuB84BJ4HHgokN4bknSQTjo0K+qe4GXT1H/NnDWFPUCNhzs80mSDp3vyJWkjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHVk5KGfZHWSu5NMJrl01M8vST0baegnWQR8GDgXOAlYm+SkUfYgST0b9ZX+acBkVd1bVf8LXAusGXEPktStI0b8fEuAB4fWdwKvGB6QZD2wvq1+P8ndI+qtB8cB3xp3E7PJ+8fdgcbkGf/7+VP0u/mi6TaMOvRnVVVXAVeNu4+FKMn2qlo17j6kqfj7ORqjnt7ZBSwbWl/aapKkERh16N8MrEyyIslRwIXAlhH3IEndGun0TlU9meQS4HpgEbCxqnaMsofOOW2mZzJ/P0cgVTXuHiRJI+I7ciWpI4a+JHXE0Jekjhj6HUry1nH3IGk8DP0+/cm4G1DfkmweWn7/ftu+MPqO+mHo9ynjbkDdWzm0/Or9tk2MspHeGPp98j5djdtMv4P+fh5Gz7jP3tH8SPI9pv7HE+C5I25H2t9zk5zC4MLzOW057fGcsXa2wPnmLEkjl+RLzHBFX1WvHF03fTH0O5LkaOACYG1VvWbc/ahfSY6sqv+bZtuKqrpv1D31wjn9BS7JUUkuSPJ3wG7gLOAvx9yWdF370MWnSfIrwI1j6Kcbhv4CleTsJH8N3Ae8DrgG2FtVF1XV58bbncStwD8m+dHrS0nOBLYCfzSupnrg9M4CleSHwFeA39/3X+Uk91bViePtTBpI8k7gHAbfmX028OfAa6tq+1gbW+C8e2fhOpXB9xV8Mcm9DL6PeNF4W5J+rKrem+Rx4BYGd+28qqomx9zWgueVfgeS/BqwlsE0zzeAz7avpZTGIsnnGNy9E+AMYBJ4aN/2qvqtMbW24Bn6HUnyLOA3gTdU1cXj7kf9SvIbM22vqn8ZVS+9cXpngUrye1X1N235jKr616r6IfCFJC8dc3vq3HShnmQZg2lJQ/8w8e6dhWv4Q9X+Yr9tfzDKRqSZJJlI8sdJvgJ8CVg85pYWNK/0F65MszzVujRSSZ4PvBb4XeClwGeAFVW1dKyNdcDQX7hqmuWp1qVRewS4CXgn8NWqqiQXjLmnLvhC7gLVboWbZHBV/+K2TFs/saqOHldvUvsinwuBo4FPAp8Ctvk+ksPP0F+gkrxopu1V9cCoepGmk+REBuG/lsFn7F/G4Jbi/xxrYwuYod+Zdtvm2qr6xLh7Ub+SnFBV/71f7ZcYhP8bquol4+ls4TP0F6gkLwA2AEuALcA24BLgbcA3qmrNGNtT55LcWlWntuVPV9Xrxt1TL3whd+H6OPAo8G/AHwLvYDCff35V3TbOxiSefgeZ8/gjZOgvXCdW1S8DJPkrBh+rfEJV/WC8bUnAzHeX6TAy9BeuH31BRVU9lWSnga9nkJcn+S7t6xHbMm29quoF42ttYXNOf4FK8hTwP/tWGXzv6OP4j0rqmqEvSR3xs3ckqSOGviR1xNCXpI4Y+pLUEUNfkjry/9XeZk0OCEF+AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0\n", - "1 0\n", - "2 1\n", - "3 0\n", - "4 1\n", - "Name: label, dtype: object" + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EWxNWtaLwGmg", + "outputId": "0d46397d-187a-46cc-ef85-b180bdbd93e1" + }, + "source": [ + "df.label = df.label.astype(str)\n", + "df.label.unique()\n" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['FAKE', 'REAL'], dtype=object)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 4 + } ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "#VECOTRIZATION\n" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (0, 79944)\t0.029961665151307385\n", - " (0, 79749)\t0.018595474099453224\n", - " (0, 79395)\t0.036615467772519145\n", - " (0, 79376)\t0.02355747992988494\n", - " (0, 79332)\t0.03594384891128402\n", - " (0, 79289)\t0.02697568668358886\n", - " (0, 79282)\t0.019878140017837338\n", - " (0, 79278)\t0.00860342274360593\n", - " (0, 79115)\t0.01570600645390605\n", - " (0, 79014)\t0.01438969583128738\n", - " (0, 78914)\t0.027881406253180738\n", - " (0, 78913)\t0.016495809217904393\n", - " (0, 78845)\t0.038416858595659395\n", - " (0, 78761)\t0.024355075927542776\n", - " (0, 78610)\t0.034694878207868526\n", - " (0, 78517)\t0.03515710620069247\n", - " (0, 78488)\t0.0180895260083385\n", - " (0, 78342)\t0.013680144757713397\n", - " (0, 78300)\t0.018725162203103333\n", - " (0, 78293)\t0.0611263997401894\n", - " (0, 78034)\t0.01136718115233016\n", - " (0, 77986)\t0.030504187362211135\n", - " (0, 77868)\t0.04229651595781814\n", - " (0, 77761)\t0.032266387789282994\n", - " (0, 77687)\t0.023218154073912735\n", - " :\t:\n", - " (6334, 9116)\t0.034961446302379004\n", - " (6334, 9103)\t0.09675332285419716\n", - " (6334, 9098)\t0.0665954515387032\n", - " (6334, 9096)\t0.09367722226789081\n", - " (6334, 8524)\t0.017506370821624376\n", - " (6334, 8437)\t0.032934465710185316\n", - " (6334, 8392)\t0.03285946712960298\n", - " (6334, 8364)\t0.026421841129276452\n", - " (6334, 8160)\t0.05153619030502903\n", - " (6334, 8097)\t0.03878242545700508\n", - " (6334, 8071)\t0.025725569688909215\n", - " (6334, 8069)\t0.026437396261267423\n", - " (6334, 8064)\t0.05380688226235036\n", - " (6334, 7944)\t0.03420695909627367\n", - " (6334, 7280)\t0.030962779833488763\n", - " (6334, 7186)\t0.03791674404047929\n", - " (6334, 6735)\t0.02324823316786639\n", - " (6334, 6441)\t0.032711727539673545\n", - " (6334, 5861)\t0.02871980380697055\n", - " (6334, 5497)\t0.03232012030466833\n", - " (6334, 5037)\t0.016582915320627552\n", - " (6334, 3644)\t0.05681576883972211\n", - " (6334, 2203)\t0.026594785511428735\n", - " (6334, 2181)\t0.044954422402769124\n", - " (6334, 1838)\t0.01499847407259141\n" - ] - } - ], - "source": [ - "from sklearn.feature_extraction.text import TfidfTransformer\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "\n", - "\n", - "\n", - "count_vectorizer = CountVectorizer()\n", - "count_vectorizer.fit_transform(x_df)\n", - "freq_term_matrix = count_vectorizer.transform(x_df)\n", - "\n", - "tfidf = TfidfTransformer(norm = \"l2\")\n", - "tfidf.fit(freq_term_matrix)\n", - "tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)\n", - "\n", - "print(tf_idf_matrix)" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [], - "source": [ - "#Splitting data into train and test data\n", - "\n", - "x_train, x_test, y_train, y_test = train_test_split(tf_idf_matrix,\n", - " y_df, random_state=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [], - "source": [ - "#Implementing DIfferent Models and checking accuracy\n" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1957 1\n", - "5016 1\n", - "3166 1\n", - "811 0\n", - "3389 1\n", - "Name: label, dtype: object" + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B9fusoxAwGmg", + "outputId": "29e3e791-352a-4cfd-9c49-8a7e53dc93c3" + }, + "source": [ + "df.label = df.label.astype(str)\n", + "df.label = df.label.str.strip()\n", + "\n", + "\n", + "dict = { 'REAL' : '1' , 'FAKE' : '0'}\n", + "\n", + "df['label'] = df['label'].map(dict)\n", + "\n", + "df['label'].head()" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 1\n", + "3 0\n", + "4 1\n", + "Name: label, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kH9wAauYwGmh" + }, + "source": [ + "x_df = df['total']\n", + "y_df = df['label']" + ], + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DPxFPnLswGmh", + "outputId": "67a661c8-e272-46ef-b84c-3305cfc68cb7" + }, + "source": [ + "x_df.head()" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 you can smell hillarys fear daniel greenfield...\n", + "1 watch the exact moment paul ryan committed po...\n", + "2 kerry go paris gesture sympathy us secretary ...\n", + "3 bernie supporter twitter erupt anger dnc we t...\n", + "4 the battle new york why this primary matters ...\n", + "Name: total, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ujPxlDuCwGmh", + "outputId": "31030a2e-9192-41fe-f039-127354691f6d" + }, + "source": [ + "y_df.head()" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 0\n", + "1 0\n", + "2 1\n", + "3 0\n", + "4 1\n", + "Name: label, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HWKlp7ugwGmi" + }, + "source": [ + "#VECOTRIZATION\n" + ], + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B4Zz8i5FwGmi", + "outputId": "3b7d5998-f7c3-4ddb-9092-65a86ea8f007" + }, + "source": [ + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "\n", + "\n", + "count_vectorizer = CountVectorizer()\n", + "count_vectorizer.fit_transform(x_df)\n", + "freq_term_matrix = count_vectorizer.transform(x_df)\n", + "\n", + "tfidf = TfidfTransformer(norm = \"l2\")\n", + "tfidf.fit(freq_term_matrix)\n", + "tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)\n", + "\n", + "print(tf_idf_matrix)" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "text": [ + " (0, 84103)\t0.02880344134538735\n", + " (0, 84077)\t0.03002128664210717\n", + " (0, 83877)\t0.018607343328196482\n", + " (0, 83514)\t0.03653774133019167\n", + " (0, 83492)\t0.023707364893939736\n", + " (0, 83447)\t0.03586754816546842\n", + " (0, 83403)\t0.02695875045200971\n", + " (0, 83396)\t0.019835943175965133\n", + " (0, 83392)\t0.008585159602852022\n", + " (0, 83218)\t0.01572657851425174\n", + " (0, 83110)\t0.014470516380511316\n", + " (0, 83004)\t0.027822220268486773\n", + " (0, 83003)\t0.016460792307242865\n", + " (0, 82929)\t0.03833530820382529\n", + " (0, 82841)\t0.024379366941566442\n", + " (0, 82678)\t0.0346212287472953\n", + " (0, 82578)\t0.035082475533552405\n", + " (0, 82545)\t0.018051125993657446\n", + " (0, 82391)\t0.013651104872461956\n", + " (0, 82373)\t0.014279148105824169\n", + " (0, 82346)\t0.018685412875057213\n", + " (0, 82339)\t0.060996642075652405\n", + " (0, 82073)\t0.01138273361804669\n", + " (0, 82023)\t0.03043943380028792\n", + " (0, 81896)\t0.04220672992179989\n", + " :\t:\n", + " (6334, 9343)\t0.06663788710149954\n", + " (6334, 9341)\t0.09390119973816259\n", + " (6334, 9232)\t0.01929156802079928\n", + " (6334, 8729)\t0.01751752612849853\n", + " (6334, 8631)\t0.03314648822581055\n", + " (6334, 8584)\t0.032880405646402766\n", + " (6334, 8554)\t0.026438677499817233\n", + " (6334, 8334)\t0.05165463029655788\n", + " (6334, 8267)\t0.0388071381665498\n", + " (6334, 8241)\t0.02574196238544902\n", + " (6334, 8238)\t0.02645424254375812\n", + " (6334, 8232)\t0.05462537789001593\n", + " (6334, 8105)\t0.034228756253996344\n", + " (6334, 7424)\t0.030982509754340935\n", + " (6334, 7333)\t0.013646914039640313\n", + " (6334, 7322)\t0.03816721468972994\n", + " (6334, 6849)\t0.023263047270567164\n", + " (6334, 6547)\t0.03273257191471888\n", + " (6334, 5935)\t0.028738104471802604\n", + " (6334, 5540)\t0.03251624806904254\n", + " (6334, 5056)\t0.016593482188606904\n", + " (6334, 3644)\t0.05685197265050367\n", + " (6334, 2203)\t0.02661173208457374\n", + " (6334, 2181)\t0.04498306799598561\n", + " (6334, 1838)\t0.015008031312217405\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hi2MeXnjwGmi" + }, + "source": [ + "#Splitting data into train and test data\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(tf_idf_matrix,\n", + " y_df, random_state=0)" + ], + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "DlYutmVwwGmj" + }, + "source": [ + "#Implementing DIfferent Models and checking accuracy\n" + ], + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KNx0-RkmwGmj", + "outputId": "dbd1be4e-fc60-4f59-d8fb-7775cd87bccd" + }, + "source": [ + "y_train.head()" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1957 1\n", + "5016 1\n", + "3166 1\n", + "811 0\n", + "3389 1\n", + "Name: label, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "v_scpTcjwGmj", + "outputId": "570df87d-72e6-438a-ad92-d16e38f1f4f3" + }, + "source": [ + "#LOGISTIC REGRESSION\n", + "\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logreg = LogisticRegression()\n", + "logreg.fit(x_train, y_train)\n", + "Accuracy = logreg.score(x_test, y_test)\n", + "\n", + "print(Accuracy*100)" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "text": [ + "91.72979797979798\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wt4QZNMqwGmk", + "outputId": "80272a91-9a21-4d4b-8162-80aa6c00eb4c" + }, + "source": [ + "#NAIVE BAYES\n", + "\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "\n", + "NB = MultinomialNB()\n", + "NB.fit(x_train, y_train)\n", + "Accuracy = NB.score(x_test, y_test)\n", + "\n", + "print(Accuracy*100)" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "text": [ + "82.32323232323232\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KAOk0BpjwGmk", + "outputId": "d22f3aa3-f1ec-47f8-d7bc-ca4dca5b9f55" + }, + "source": [ + "# DECISION TREE\n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "\n", + "clf = DecisionTreeClassifier()\n", + "clf.fit(x_train, y_train)\n", + "Accuracy = clf.score(x_test, y_test)\n", + "\n", + "print(Accuracy*100)" + ], + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "text": [ + "80.93434343434343\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gsZTW5zewGmk", + "outputId": "b588f0bf-707b-4bb3-e2fe-7c75d0885fab" + }, + "source": [ + "# PASSIVE-AGGRESSIVE CLASSIFIER\n", + "\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from sklearn.linear_model import PassiveAggressiveClassifier\n", + "\n", + "#DataFlair - Initialize a PassiveAggressiveClassifier\n", + "pac=PassiveAggressiveClassifier(max_iter=50)\n", + "pac.fit(x_train,y_train)\n", + "#DataFlair - Predict on the test set and calculate accuracy\n", + "y_pred=pac.predict(x_test)\n", + "score=accuracy_score(y_test,y_pred)\n", + "\n", + "print(f'Accuracy: {round(score*100,2)}%')\n", + "\n", + "\n" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Accuracy: 94.07%\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J35xQeYuwGml", + "outputId": "e7e91e80-0435-4854-b3a0-af545ddffb9c" + }, + "source": [ + "# KNN CLASSIFIER\r\n", + " \r\n", + "from sklearn.neighbors import KNeighborsClassifier\r\n", + "\r\n", + "c2 = KNeighborsClassifier()\r\n", + "c2.fit(x_train, y_train)\r\n", + "Accuracy = c2.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "text": [ + "82.51262626262627\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yVNaLPwzwGml", + "outputId": "2aad115a-b15f-4ad0-8deb-a6a6de30e3af" + }, + "source": [ + "# SUPPORT VECTOR CLASSIFIER\r\n", + " \r\n", + "from sklearn.svm import LinearSVC\r\n", + "\r\n", + "c3 = LinearSVC()\r\n", + "c3.fit(x_train, y_train)\r\n", + "Accuracy = c3.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "text": [ + "93.81313131313132\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vt3RN2QVySua", + "outputId": "1d19a080-7090-4cd5-d4f5-4476737257aa" + }, + "source": [ + "# RANDOM FOREST CLASSIFIER\r\n", + "\r\n", + "from sklearn.ensemble import RandomForestClassifier\r\n", + "\r\n", + "c6 = RandomForestClassifier(n_estimators = 70, criterion = 'entropy', random_state = 0)\r\n", + "c6.fit(x_train, y_train)\r\n", + "Accuracy = c6.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "text": [ + "90.59343434343434\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RugwaZcgyuzD", + "outputId": "3f04fb8a-bfbc-44e7-dfb7-7a93e95d1eb8" + }, + "source": [ + "# ENSEMBLE METHOD - COMBINING ALL MODELS \r\n", + "from sklearn.ensemble import VotingClassifier\r\n", + "\r\n", + "ens1 = VotingClassifier( estimators= [('LR',logreg),('NB',NB),('DT',clf),('PAC',pac),('KNN',c2),('SVC',c3),('RF',c6)], voting = 'hard')\r\n", + "\r\n", + "\r\n", + "ens1.fit(x_train, y_train)\r\n", + "Accuracy = ens1.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "text": [ + "93.43434343434343\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GKae-kpx0pow" + }, + "source": [ + "No change in the accuracy at all. This just shows that the Naive Bayes, Decision tree and KNN classifier added very little to the accuracy of the ensemble model." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KupDb0er0kr3", + "outputId": "b15f4f2c-0639-4eeb-a047-23e1085dc1bf" + }, + "source": [ + "# ENSEMBLE METHOD - COMBINING ALL MODELS THAT HAVE ACCURACY > 90% \r\n", + "\r\n", + "ens2 = VotingClassifier( estimators= [('LR',logreg),('PAC',pac),('SVC',c3),('RF',c6)], voting = 'hard')\r\n", + "\r\n", + "\r\n", + "ens2.fit(x_train, y_train)\r\n", + "Accuracy = ens2.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 39, + "outputs": [ + { + "output_type": "stream", + "text": [ + "93.49747474747475\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "doB9aOQY0AY0", + "outputId": "fa83c125-a838-45f6-f885-48ce42c31aad" + }, + "source": [ + "# ENSEMBLE METHOD - COMBINING ALL MODELS THAT HAVE ACCURACY > 91% \r\n", + "\r\n", + "ens3 = VotingClassifier( estimators= [('LR',logreg),('PAC',pac),('SVC',c3)], voting = 'hard')\r\n", + "\r\n", + "\r\n", + "ens3.fit(x_train, y_train)\r\n", + "Accuracy = ens3.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "text": [ + "93.81313131313132\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gtFKmdbp1SXo", + "outputId": "d73478d3-dc1c-44af-df3e-adb4e92fc1b6" + }, + "source": [ + "# ENSEMBLE METHOD - COMBINING top two models\r\n", + "\r\n", + "ens4 = VotingClassifier( estimators= [('PAC',pac),('SVC',c3)], voting = 'hard')\r\n", + "\r\n", + "ens4.fit(x_train, y_train)\r\n", + "Accuracy = ens4.score(x_test, y_test)\r\n", + "\r\n", + "print(Accuracy*100)" + ], + "execution_count": 41, + "outputs": [ + { + "output_type": "stream", + "text": [ + "93.87626262626263\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_train.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "91.16161616161617\n" - ] - } - ], - "source": [ - "#LOGISTIC REGRESSION\n", - "\n", - "\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "logreg = LogisticRegression()\n", - "logreg.fit(x_train, y_train)\n", - "Accuracy = logreg.score(x_test, y_test)\n", - "\n", - "print(Accuracy*100)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "82.32323232323232\n" - ] - } - ], - "source": [ - "#NAIVE BAYES\n", - "\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "\n", - "\n", - "NB = MultinomialNB()\n", - "NB.fit(x_train, y_train)\n", - "Accuracy = NB.score(x_test, y_test)\n", - "\n", - "print(Accuracy*100)" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "80.49242424242425\n" - ] - } - ], - "source": [ - "# DECISION TREE\n", - "\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "\n", - "\n", - "clf = DecisionTreeClassifier()\n", - "clf.fit(x_train, y_train)\n", - "Accuracy = clf.score(x_test, y_test)\n", - "\n", - "print(Accuracy*100)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 93.12%\n" - ] } - ], - "source": [ - "# PASSIVE-AGGRESSIVE CLASSIFIER\n", - "\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "from sklearn.linear_model import PassiveAggressiveClassifier\n", - "\n", - "#DataFlair - Initialize a PassiveAggressiveClassifier\n", - "pac=PassiveAggressiveClassifier(max_iter=50)\n", - "pac.fit(x_train,y_train)\n", - "#DataFlair - Predict on the test set and calculate accuracy\n", - "y_pred=pac.predict(x_test)\n", - "score=accuracy_score(y_test,y_pred)\n", - "\n", - "print(f'Accuracy: {round(score*100,2)}%')\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + ] +} \ No newline at end of file