4.Text Classification Using ML.ipynb
4.Text Classification Using ML.ipynb
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"Data link: [IMDB Dataset of 50K Movie Reviews (Kaggle)](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)"
],
"metadata": {
"id": "_7_3WPQ-tjcF"
}
},
{
"cell_type": "code",
"source": [
"# Standard library\n",
"import os\n",
"import re\n",
"import string\n",
"\n",
"# Third-party: numerics, data handling, plotting, NLTK\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"\n",
"# Third-party: scikit-learn\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.metrics import confusion_matrix\n",
"\n",
"# Third-party: Keras\n",
"from keras.models import Model\n",
"from keras.models import Sequential\n",
"from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D\n",
"from keras.optimizers import RMSprop\n",
"from keras.preprocessing.text import Tokenizer\n",
"from keras.preprocessing import sequence\n",
"from keras.utils import to_categorical\n",
"from keras.utils import pad_sequences\n",
"from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
"\n",
"# Download the NLTK stop-word corpus that nltk.corpus.stopwords reads from.\n",
"nltk.download('stopwords')\n",
"\n",
"# Display settings: no row/column truncation, up to 255 characters per cell.\n",
"pd.set_option('display.max_rows', None)\n",
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_colwidth', 255)"
],
"metadata": {
"id": "8YwMBSlZR6x8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pbGDrfins8su",
"outputId": "89771304-fa57-4d74-e760-b184b9c0561b"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/bin/bash: line 1: nvidia-smi: command not found\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd"
],
"metadata": {
"id": "k5UZB1YWtTVI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount(\"/content/gdrive\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zljGgXtYuA6Y",
"outputId": "a145cf8b-399e-4c83-c085-b766d17fad3d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/gdrive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Location of the IMDB reviews CSV inside the Google Drive mounted at /content/gdrive.\n",
"data_path = \"/content/gdrive/MyDrive/005_BOKTIAR_AHMED_BAPPY/My_classes/DSM 1.0/ALL_NOTES/Day_03/data/IMDB Dataset.csv\""
],
"metadata": {
"id": "amfV2WspuTnf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(data_path)\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "U5NdlnEyu6Dq",
"outputId": "43001e16-430f-4e52-d70a-c4c35e549414"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. <br /><br />The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
],
"text/html": [
"\n",
" <div id=\"df-801e1195-0043-469a-ae6d-58076a2a22c9\" class=\"colab-
df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. <br /><br
/>The...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\
n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <div class=\"colab-df-buttons\">\n",
"\n",
" <div class=\"colab-df-container\">\n",
" <button class=\"colab-df-convert\"
onclick=\"convertToInteractive('df-801e1195-0043-469a-ae6d-58076a2a22c9')\"\n",
" title=\"Convert this dataframe to an interactive
table.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"
viewBox=\"0 -960 960 960\">\n",
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-
160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-
160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-
160H620v160Z\"/>\n",
" </svg>\n",
" </button>\n",
"\n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" .colab-df-buttons div {\n",
" margin-bottom: 4px;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-801e1195-0043-469a-ae6d-
58076a2a22c9 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-801e1195-0043-
469a-ae6d-58076a2a22c9');\n",
" const dataTable =\n",
" await
google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\
n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
"\n",
"\n",
"<div id=\"df-892051fa-0402-4bbd-b56c-85f813d5b58d\">\n",
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-
892051fa-0402-4bbd-b56c-85f813d5b58d')\"\n",
" title=\"Suggest charts.\"\n",
" style=\"display:none;\">\n",
"\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0
0 24 24\"\n",
" width=\"24px\">\n",
" <g>\n",
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0
2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
" </g>\n",
"</svg>\n",
" </button>\n",
"\n",
"<style>\n",
" .colab-df-quickchart {\n",
" --bg-color: #E8F0FE;\n",
" --fill-color: #1967D2;\n",
" --hover-bg-color: #E2EBFA;\n",
" --hover-fill-color: #174EA6;\n",
" --disabled-fill-color: #AAA;\n",
" --disabled-bg-color: #DDD;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-quickchart {\n",
" --bg-color: #3B4455;\n",
" --fill-color: #D2E3FC;\n",
" --hover-bg-color: #434B5C;\n",
" --hover-fill-color: #FFFFFF;\n",
" --disabled-bg-color: #3B4455;\n",
" --disabled-fill-color: #666;\n",
" }\n",
"\n",
" .colab-df-quickchart {\n",
" background-color: var(--bg-color);\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: var(--fill-color);\n",
" height: 32px;\n",
" padding: 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-quickchart:hover {\n",
" background-color: var(--hover-bg-color);\n",
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: var(--button-hover-fill-color);\n",
" }\n",
"\n",
" .colab-df-quickchart-complete:disabled,\n",
" .colab-df-quickchart-complete:disabled:hover {\n",
" background-color: var(--disabled-bg-color);\n",
" fill: var(--disabled-fill-color);\n",
" box-shadow: none;\n",
" }\n",
"\n",
" .colab-df-spinner {\n",
" border: 2px solid var(--fill-color);\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" animation:\n",
" spin 1s steps(1) infinite;\n",
" }\n",
"\n",
" @keyframes spin {\n",
" 0% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" border-left-color: var(--fill-color);\n",
" }\n",
" 20% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 30% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 40% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 60% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 80% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" 90% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" }\n",
"</style>\n",
"\n",
" <script>\n",
" async function quickchart(key) {\n",
" const quickchartButtonEl =\n",
" document.querySelector('#' + key + ' button');\n",
" quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n",
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
" try {\n",
" const charts = await google.colab.kernel.invokeFunction(\n",
" 'suggestCharts', [key], {});\n",
" } catch (error) {\n",
" console.error('Error during call to suggestCharts:',
error);\n",
" }\n",
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
" quickchartButtonEl.classList.add('colab-df-quickchart-
complete');\n",
" }\n",
" (() => {\n",
" let quickchartButtonEl =\n",
" document.querySelector('#df-892051fa-0402-4bbd-b56c-
85f813d5b58d button');\n",
" quickchartButtonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
" })();\n",
" </script>\n",
"</div>\n",
" </div>\n",
" </div>\n"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"df.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BNi-ZW6uu_lP",
"outputId": "991ff212-6d38-4d81-bd4c-55225237c6b9"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(50000, 2)"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"source": [
"df = df.iloc[:10000]\n",
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "eXFEZ2mbvDWa",
"outputId": "fb4fdc59-c61d-4ba0-a264-c74b31f4eedc"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. <br /><br />The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
],
"text/html": [
"\n",
" <div id=\"df-773e932b-aea5-494e-b8dd-9aeabae9c58c\" class=\"colab-
df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. <br /><br
/>The...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\
n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <div class=\"colab-df-buttons\">\n",
"\n",
" <div class=\"colab-df-container\">\n",
" <button class=\"colab-df-convert\"
onclick=\"convertToInteractive('df-773e932b-aea5-494e-b8dd-9aeabae9c58c')\"\n",
" title=\"Convert this dataframe to an interactive
table.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"
viewBox=\"0 -960 960 960\">\n",
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-
160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-
160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-
160H620v160Z\"/>\n",
" </svg>\n",
" </button>\n",
"\n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" .colab-df-buttons div {\n",
" margin-bottom: 4px;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-773e932b-aea5-494e-b8dd-
9aeabae9c58c button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-773e932b-aea5-
494e-b8dd-9aeabae9c58c');\n",
" const dataTable =\n",
" await
google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\
n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
"\n",
"\n",
"<div id=\"df-5077cce7-314a-4013-99bf-6c05efb73905\">\n",
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-
5077cce7-314a-4013-99bf-6c05efb73905')\"\n",
" title=\"Suggest charts.\"\n",
" style=\"display:none;\">\n",
"\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0
0 24 24\"\n",
" width=\"24px\">\n",
" <g>\n",
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0
2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
" </g>\n",
"</svg>\n",
" </button>\n",
"\n",
"<style>\n",
" .colab-df-quickchart {\n",
" --bg-color: #E8F0FE;\n",
" --fill-color: #1967D2;\n",
" --hover-bg-color: #E2EBFA;\n",
" --hover-fill-color: #174EA6;\n",
" --disabled-fill-color: #AAA;\n",
" --disabled-bg-color: #DDD;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-quickchart {\n",
" --bg-color: #3B4455;\n",
" --fill-color: #D2E3FC;\n",
" --hover-bg-color: #434B5C;\n",
" --hover-fill-color: #FFFFFF;\n",
" --disabled-bg-color: #3B4455;\n",
" --disabled-fill-color: #666;\n",
" }\n",
"\n",
" .colab-df-quickchart {\n",
" background-color: var(--bg-color);\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: var(--fill-color);\n",
" height: 32px;\n",
" padding: 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-quickchart:hover {\n",
" background-color: var(--hover-bg-color);\n",
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: var(--button-hover-fill-color);\n",
" }\n",
"\n",
" .colab-df-quickchart-complete:disabled,\n",
" .colab-df-quickchart-complete:disabled:hover {\n",
" background-color: var(--disabled-bg-color);\n",
" fill: var(--disabled-fill-color);\n",
" box-shadow: none;\n",
" }\n",
"\n",
" .colab-df-spinner {\n",
" border: 2px solid var(--fill-color);\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" animation:\n",
" spin 1s steps(1) infinite;\n",
" }\n",
"\n",
" @keyframes spin {\n",
" 0% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" border-left-color: var(--fill-color);\n",
" }\n",
" 20% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 30% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 40% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 60% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 80% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" 90% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" }\n",
"</style>\n",
"\n",
" <script>\n",
" async function quickchart(key) {\n",
" const quickchartButtonEl =\n",
" document.querySelector('#' + key + ' button');\n",
" quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n",
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
" try {\n",
" const charts = await google.colab.kernel.invokeFunction(\n",
" 'suggestCharts', [key], {});\n",
" } catch (error) {\n",
" console.error('Error during call to suggestCharts:',
error);\n",
" }\n",
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
" quickchartButtonEl.classList.add('colab-df-quickchart-
complete');\n",
" }\n",
" (() => {\n",
" let quickchartButtonEl =\n",
" document.querySelector('#df-5077cce7-314a-4013-99bf-
6c05efb73905 button');\n",
" quickchartButtonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
" })();\n",
" </script>\n",
"</div>\n",
" </div>\n",
" </div>\n"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"df.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kjQNaHmZvNsu",
"outputId": "8080aa5f-dc4b-43d0-924c-ba696a1419b7"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(10000, 2)"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"df['review'][0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 174
},
"id": "wfnzaafjvPg-",
"outputId": "74113617-b272-4c03-fefc-70f53cfa549a"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\"One of the other reviewers has mentioned that after watching just
1 Oz episode you'll be hooked. They are right, as this is exactly what happened
with me.<br /><br />The first thing that struck me about Oz was its brutality and
unflinching scenes of violence, which set in right from the word GO. Trust me, this
is not a show for the faint hearted or timid. This show pulls no punches with
regards to drugs, sex or violence. Its is hardcore, in the classic use of the
word.<br /><br />It is called OZ as that is the nickname given to the Oswald
Maximum Security State Penitentary. It focuses mainly on Emerald City, an
experimental section of the prison where all the cells have glass fronts and face
inwards, so privacy is not high on the agenda. Em City is home to many..Aryans,
Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles,
death stares, dodgy dealings and shady agreements are never far away.<br /><br />I
would say the main appeal of the show is due to the fact that it goes where other
shows wouldn't dare. Forget pretty pictures painted for mainstream audiences,
forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw
struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I
watched more, I developed a taste for Oz, and got accustomed to the high levels of
graphic violence. Not just violence, but injustice (crooked guards who'll be sold
out for a nickel, inmates who'll kill on order and get away with it, well mannered,
middle class inmates being turned into prison bitches due to their lack of street
skills or prison experience) Watching Oz, you may become comfortable with what is
uncomfortable viewing....thats if you can get in touch with your darker side.\""
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"source": [
"df['sentiment'].value_counts()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "arOwMrEWvUwW",
"outputId": "473989d8-ca0b-4817-d8da-b5dc5dc641a4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"positive 5028\n",
"negative 4972\n",
"Name: sentiment, dtype: int64"
]
},
"metadata": {},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"source": [
"df.isnull().sum()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "urgIjuGcvbDr",
"outputId": "efe91433-2817-442c-b6f0-bf2826b6abf5"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"review 0\n",
"sentiment 0\n",
"dtype: int64"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"df.duplicated().sum()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BDx3G8Luvnvv",
"outputId": "e1a44c02-d5e6-4a3d-f5e3-44bc5a7cd607"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"17"
]
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"source": [
"# Keep the first occurrence of each duplicated row; rebind instead of mutating in place.\n",
"df = df.drop_duplicates()"
],
"metadata": {
"id": "SpGnnRdTvsku"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df.duplicated().sum()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4w35Dnttv7or",
"outputId": "aa716016-665a-4149-802b-88d62bfa6c65"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"source": [
"# Basic Preprocessing\n",
" - Remove HTML tags\n",
" - Convert to lowercase\n",
" - Remove stopwords"
],
"metadata": {
"id": "9lRaPvblv_Ge"
}
},
{
"cell_type": "code",
"source": [
"import re\n",
"\n",
"# Non-greedy match of everything between '<' and '>' (e.g. the '<br />' breaks in the reviews).\n",
"_TAG_RE = re.compile('<.*?>')\n",
"\n",
"\n",
"def remove_tags(raw_text):\n",
"    \"\"\"Return raw_text with HTML-style tags removed.\n",
"\n",
"    Each tag is replaced by a space rather than by nothing, so that words\n",
"    separated only by markup (\"me.<br /><br />The\") are not fused together.\n",
"    Runs of whitespace are then collapsed to a single space and the ends are\n",
"    trimmed.\n",
"\n",
"    Args:\n",
"        raw_text: The text to clean, as a str.\n",
"\n",
"    Returns:\n",
"        The cleaned text as a str.\n",
"    \"\"\"\n",
"    without_tags = _TAG_RE.sub(' ', raw_text)\n",
"    # str.split() with no argument splits on any whitespace run and drops empty pieces.\n",
"    return ' '.join(without_tags.split())"
],
"metadata": {
"id": "--LhGuR-v9PY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df['review'] = df['review'].apply(remove_tags)"
],
"metadata": {
"id": "axy3acc2wVT_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "SUZPCBU1wYZP",
"outputId": "e665dc52-9236-411a-ca68-b5158f3a5be4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. The filming tec... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
],
"text/html": [
"\n",
" <div id=\"df-9713cbf4-2258-4ff2-a8ad-05fc9776cb62\" class=\"colab-
df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. The filming tec...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\
n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <div class=\"colab-df-buttons\">\n",
"\n",
" <div class=\"colab-df-container\">\n",
" <button class=\"colab-df-convert\"
onclick=\"convertToInteractive('df-9713cbf4-2258-4ff2-a8ad-05fc9776cb62')\"\n",
" title=\"Convert this dataframe to an interactive
table.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"
viewBox=\"0 -960 960 960\">\n",
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-
160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-
160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-
160H620v160Z\"/>\n",
" </svg>\n",
" </button>\n",
"\n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" .colab-df-buttons div {\n",
" margin-bottom: 4px;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-9713cbf4-2258-4ff2-a8ad-
05fc9776cb62 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-9713cbf4-2258-
4ff2-a8ad-05fc9776cb62');\n",
" const dataTable =\n",
" await
google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\
n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
"\n",
"\n",
"<div id=\"df-d7110a63-8fc3-499b-93f4-c4e1cce91858\">\n",
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-
d7110a63-8fc3-499b-93f4-c4e1cce91858')\"\n",
" title=\"Suggest charts.\"\n",
" style=\"display:none;\">\n",
"\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0
0 24 24\"\n",
" width=\"24px\">\n",
" <g>\n",
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0
2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
" </g>\n",
"</svg>\n",
" </button>\n",
"\n",
"<style>\n",
" .colab-df-quickchart {\n",
" --bg-color: #E8F0FE;\n",
" --fill-color: #1967D2;\n",
" --hover-bg-color: #E2EBFA;\n",
" --hover-fill-color: #174EA6;\n",
" --disabled-fill-color: #AAA;\n",
" --disabled-bg-color: #DDD;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-quickchart {\n",
" --bg-color: #3B4455;\n",
" --fill-color: #D2E3FC;\n",
" --hover-bg-color: #434B5C;\n",
" --hover-fill-color: #FFFFFF;\n",
" --disabled-bg-color: #3B4455;\n",
" --disabled-fill-color: #666;\n",
" }\n",
"\n",
" .colab-df-quickchart {\n",
" background-color: var(--bg-color);\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: var(--fill-color);\n",
" height: 32px;\n",
" padding: 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-quickchart:hover {\n",
" background-color: var(--hover-bg-color);\n",
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: var(--button-hover-fill-color);\n",
" }\n",
"\n",
" .colab-df-quickchart-complete:disabled,\n",
" .colab-df-quickchart-complete:disabled:hover {\n",
" background-color: var(--disabled-bg-color);\n",
" fill: var(--disabled-fill-color);\n",
" box-shadow: none;\n",
" }\n",
"\n",
" .colab-df-spinner {\n",
" border: 2px solid var(--fill-color);\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" animation:\n",
" spin 1s steps(1) infinite;\n",
" }\n",
"\n",
" @keyframes spin {\n",
" 0% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" border-left-color: var(--fill-color);\n",
" }\n",
" 20% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 30% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 40% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 60% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 80% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" 90% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" }\n",
"</style>\n",
"\n",
" <script>\n",
" async function quickchart(key) {\n",
" const quickchartButtonEl =\n",
" document.querySelector('#' + key + ' button');\n",
" quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n",
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
" try {\n",
" const charts = await google.colab.kernel.invokeFunction(\n",
" 'suggestCharts', [key], {});\n",
" } catch (error) {\n",
" console.error('Error during call to suggestCharts:',
error);\n",
" }\n",
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
" quickchartButtonEl.classList.add('colab-df-quickchart-
complete');\n",
" }\n",
" (() => {\n",
" let quickchartButtonEl =\n",
" document.querySelector('#df-d7110a63-8fc3-499b-93f4-
c4e1cce91858 button');\n",
" quickchartButtonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
" })();\n",
" </script>\n",
"</div>\n",
" </div>\n",
" </div>\n"
]
},
"metadata": {},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"source": [
"df.loc[0, 'review']"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 174
},
"id": "vafADZyYwcpg",
"outputId": "51d8158e-62bf-45c5-f154-0ae5ed5f5532"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\"One of the other reviewers has mentioned that after watching just
1 Oz episode you'll be hooked. They are right, as this is exactly what happened
with me.The first thing that struck me about Oz was its brutality and unflinching
scenes of violence, which set in right from the word GO. Trust me, this is not a
show for the faint hearted or timid. This show pulls no punches with regards to
drugs, sex or violence. Its is hardcore, in the classic use of the word.It is
called OZ as that is the nickname given to the Oswald Maximum Security State
Penitentary. It focuses mainly on Emerald City, an experimental section of the
prison where all the cells have glass fronts and face inwards, so privacy is not
high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos,
Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings
and shady agreements are never far away.I would say the main appeal of the show is
due to the fact that it goes where other shows wouldn't dare. Forget pretty
pictures painted for mainstream audiences, forget charm, forget romance...OZ
doesn't mess around. The first episode I ever saw struck me as so nasty it was
surreal, I couldn't say I was ready for it, but as I watched more, I developed a
taste for Oz, and got accustomed to the high levels of graphic violence. Not just
violence, but injustice (crooked guards who'll be sold out for a nickel, inmates
who'll kill on order and get away with it, well mannered, middle class inmates
being turned into prison bitches due to their lack of street skills or prison
experience) Watching Oz, you may become comfortable with what is uncomfortable
viewing....thats if you can get in touch with your darker side.\""
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"source": [
"# Lowercase every review with the vectorised string accessor (missing values pass through as NaN).\n",
"df['review'] = df['review'].str.lower()"
],
"metadata": {
"id": "VGKaOQHDwgmY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df.loc[0, 'review']"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 174
},
"id": "7bYqQx-ywlqC",
"outputId": "87c9a554-777f-4a48-ad2b-4350fbddacd5"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\"one of the other reviewers has mentioned that after watching just
1 oz episode you'll be hooked. they are right, as this is exactly what happened
with me.the first thing that struck me about oz was its brutality and unflinching
scenes of violence, which set in right from the word go. trust me, this is not a
show for the faint hearted or timid. this show pulls no punches with regards to
drugs, sex or violence. its is hardcore, in the classic use of the word.it is
called oz as that is the nickname given to the oswald maximum security state
penitentary. it focuses mainly on emerald city, an experimental section of the
prison where all the cells have glass fronts and face inwards, so privacy is not
high on the agenda. em city is home to many..aryans, muslims, gangstas, latinos,
christians, italians, irish and more....so scuffles, death stares, dodgy dealings
and shady agreements are never far away.i would say the main appeal of the show is
due to the fact that it goes where other shows wouldn't dare. forget pretty
pictures painted for mainstream audiences, forget charm, forget romance...oz
doesn't mess around. the first episode i ever saw struck me as so nasty it was
surreal, i couldn't say i was ready for it, but as i watched more, i developed a
taste for oz, and got accustomed to the high levels of graphic violence. not just
violence, but injustice (crooked guards who'll be sold out for a nickel, inmates
who'll kill on order and get away with it, well mannered, middle class inmates
being turned into prison bitches due to their lack of street skills or prison
experience) watching oz, you may become comfortable with what is uncomfortable
viewing....thats if you can get in touch with your darker side.\""
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"source": [
"from nltk.corpus import stopwords\n",
"import nltk\n",
"\n",
"nltk.download('stopwords')\n",
"sw_list = stopwords.words('english')\n",
"# Set membership is a hash lookup; testing each token against the list rescans every stop word.\n",
"sw_set = set(sw_list)\n",
"\n",
"# Split on whitespace, drop stop words, and re-join with single spaces in one pass per review.\n",
"df['review'] = df['review'].apply(\n",
"    lambda text: \" \".join(word for word in text.split() if word not in sw_set)\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Vr6iAtOSwnNG",
"outputId": "bab64c91-9fb8-4dad-c842-aa2f98f4238d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df.loc[0, 'review']"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 174
},
"id": "eMnTPfTgw5_S",
"outputId": "4f32a5c1-7585-406b-943c-415159e91ff6"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\"one reviewers mentioned watching 1 oz episode hooked. right,
exactly happened me.the first thing struck oz brutality unflinching scenes
violence, set right word go. trust me, show faint hearted timid. show pulls punches
regards drugs, sex violence. hardcore, classic use word.it called oz nickname given
oswald maximum security state penitentary. focuses mainly emerald city,
experimental section prison cells glass fronts face inwards, privacy high agenda.
em city home many..aryans, muslims, gangstas, latinos, christians, italians, irish
more....so scuffles, death stares, dodgy dealings shady agreements never far away.i
would say main appeal show due fact goes shows dare. forget pretty pictures painted
mainstream audiences, forget charm, forget romance...oz mess around. first episode
ever saw struck nasty surreal, say ready it, watched more, developed taste oz, got
accustomed high levels graphic violence. violence, injustice (crooked guards who'll
sold nickel, inmates who'll kill order get away it, well mannered, middle class
inmates turned prison bitches due lack street skills prison experience) watching
oz, may become comfortable uncomfortable viewing....thats get touch darker side.\""
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 23
}
]
},
{
"cell_type": "code",
"source": [
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "Yz3Rz33Ew9wg",
"outputId": "c95e3c0c-418e-42c7-f09c-2ab46dbe731a"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" review sentiment\n",
"0 one reviewers mentioned watching 1 oz episode ... positive\n",
"1 wonderful little production. filming technique... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 basically there's family little boy (jake) thi... negative\n",
"4 petter mattei's \"love time money\" visually stu... positive"
],
"text/html": [
"\n",
" <div id=\"df-8d61a849-8ba7-4003-acb3-458857a95a13\" class=\"colab-
df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>one reviewers mentioned watching 1 oz episode ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonderful little production. filming technique...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>basically there's family little boy (jake) thi...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>petter mattei's \"love time money\" visually stu...</td>\
n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <div class=\"colab-df-buttons\">\n",
"\n",
" <div class=\"colab-df-container\">\n",
" <button class=\"colab-df-convert\"
onclick=\"convertToInteractive('df-8d61a849-8ba7-4003-acb3-458857a95a13')\"\n",
" title=\"Convert this dataframe to an interactive
table.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"
viewBox=\"0 -960 960 960\">\n",
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-
160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-
160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-
160H620v160Z\"/>\n",
" </svg>\n",
" </button>\n",
"\n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" .colab-df-buttons div {\n",
" margin-bottom: 4px;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-8d61a849-8ba7-4003-acb3-
458857a95a13 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-8d61a849-8ba7-
4003-acb3-458857a95a13');\n",
" const dataTable =\n",
" await
google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\
n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
"\n",
"\n",
"<div id=\"df-993d8016-5132-41c2-9ad6-f0205320ae16\">\n",
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-
993d8016-5132-41c2-9ad6-f0205320ae16')\"\n",
" title=\"Suggest charts.\"\n",
" style=\"display:none;\">\n",
"\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0
0 24 24\"\n",
" width=\"24px\">\n",
" <g>\n",
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0
2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
" </g>\n",
"</svg>\n",
" </button>\n",
"\n",
"<style>\n",
" .colab-df-quickchart {\n",
" --bg-color: #E8F0FE;\n",
" --fill-color: #1967D2;\n",
" --hover-bg-color: #E2EBFA;\n",
" --hover-fill-color: #174EA6;\n",
" --disabled-fill-color: #AAA;\n",
" --disabled-bg-color: #DDD;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-quickchart {\n",
" --bg-color: #3B4455;\n",
" --fill-color: #D2E3FC;\n",
" --hover-bg-color: #434B5C;\n",
" --hover-fill-color: #FFFFFF;\n",
" --disabled-bg-color: #3B4455;\n",
" --disabled-fill-color: #666;\n",
" }\n",
"\n",
" .colab-df-quickchart {\n",
" background-color: var(--bg-color);\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: var(--fill-color);\n",
" height: 32px;\n",
" padding: 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-quickchart:hover {\n",
" background-color: var(--hover-bg-color);\n",
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: var(--button-hover-fill-color);\n",
" }\n",
"\n",
" .colab-df-quickchart-complete:disabled,\n",
" .colab-df-quickchart-complete:disabled:hover {\n",
" background-color: var(--disabled-bg-color);\n",
" fill: var(--disabled-fill-color);\n",
" box-shadow: none;\n",
" }\n",
"\n",
" .colab-df-spinner {\n",
" border: 2px solid var(--fill-color);\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" animation:\n",
" spin 1s steps(1) infinite;\n",
" }\n",
"\n",
" @keyframes spin {\n",
" 0% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" border-left-color: var(--fill-color);\n",
" }\n",
" 20% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 30% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 40% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 60% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 80% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" 90% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" }\n",
"</style>\n",
"\n",
" <script>\n",
" async function quickchart(key) {\n",
" const quickchartButtonEl =\n",
" document.querySelector('#' + key + ' button');\n",
" quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n",
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
" try {\n",
" const charts = await google.colab.kernel.invokeFunction(\n",
" 'suggestCharts', [key], {});\n",
" } catch (error) {\n",
" console.error('Error during call to suggestCharts:',
error);\n",
" }\n",
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
" quickchartButtonEl.classList.add('colab-df-quickchart-
complete');\n",
" }\n",
" (() => {\n",
" let quickchartButtonEl =\n",
" document.querySelector('#df-993d8016-5132-41c2-9ad6-
f0205320ae16 button');\n",
" quickchartButtonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
" })();\n",
" </script>\n",
"</div>\n",
" </div>\n",
" </div>\n"
]
},
"metadata": {},
"execution_count": 24
}
]
},
{
"cell_type": "code",
"source": [
"# Select the feature column by name (matching how the target is selected);\n",
"# the double brackets keep X a one-column DataFrame.\n",
"X = df[['review']]\n",
"y = df['sentiment']"
],
"metadata": {
"id": "Ek2vJLp3xIY3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"X.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "uobKmghhxSAk",
"outputId": "b8ec3f50-0dfa-4a0d-ab4a-747afc158dae"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" review\n",
"0 one reviewers mentioned watching 1 oz episode ...\n",
"1 wonderful little production. filming technique...\n",
"2 thought wonderful way spend time hot summer we...\n",
"3 basically there's family little boy (jake) thi...\n",
"4 petter mattei's \"love time money\" visually stu..."
],
"text/html": [
"\n",
" <div id=\"df-4e2127b4-d2c6-4ffd-b4cd-306511aca841\" class=\"colab-
df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>one reviewers mentioned watching 1 oz episode ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonderful little production. filming technique...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>basically there's family little boy (jake) thi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>petter mattei's \"love time money\" visually stu...</td>\
n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <div class=\"colab-df-buttons\">\n",
"\n",
" <div class=\"colab-df-container\">\n",
" <button class=\"colab-df-convert\"
onclick=\"convertToInteractive('df-4e2127b4-d2c6-4ffd-b4cd-306511aca841')\"\n",
" title=\"Convert this dataframe to an interactive
table.\"\n",
" style=\"display:none;\">\n",
"\n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"
viewBox=\"0 -960 960 960\">\n",
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-
160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-
160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-
160H620v160Z\"/>\n",
" </svg>\n",
" </button>\n",
"\n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" .colab-df-buttons div {\n",
" margin-bottom: 4px;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-4e2127b4-d2c6-4ffd-b4cd-
306511aca841 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-4e2127b4-d2c6-
4ffd-b4cd-306511aca841');\n",
" const dataTable =\n",
" await
google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\
n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
"\n",
"\n",
"<div id=\"df-ed5d9ab4-6f72-4f7b-bb96-6aa1ec9c235c\">\n",
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-
ed5d9ab4-6f72-4f7b-bb96-6aa1ec9c235c')\"\n",
" title=\"Suggest charts.\"\n",
" style=\"display:none;\">\n",
"\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0
0 24 24\"\n",
" width=\"24px\">\n",
" <g>\n",
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0
2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
" </g>\n",
"</svg>\n",
" </button>\n",
"\n",
"<style>\n",
" .colab-df-quickchart {\n",
" --bg-color: #E8F0FE;\n",
" --fill-color: #1967D2;\n",
" --hover-bg-color: #E2EBFA;\n",
" --hover-fill-color: #174EA6;\n",
" --disabled-fill-color: #AAA;\n",
" --disabled-bg-color: #DDD;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-quickchart {\n",
" --bg-color: #3B4455;\n",
" --fill-color: #D2E3FC;\n",
" --hover-bg-color: #434B5C;\n",
" --hover-fill-color: #FFFFFF;\n",
" --disabled-bg-color: #3B4455;\n",
" --disabled-fill-color: #666;\n",
" }\n",
"\n",
" .colab-df-quickchart {\n",
" background-color: var(--bg-color);\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: var(--fill-color);\n",
" height: 32px;\n",
" padding: 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-quickchart:hover {\n",
" background-color: var(--hover-bg-color);\n",
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px
rgba(60, 64, 67, 0.15);\n",
" fill: var(--button-hover-fill-color);\n",
" }\n",
"\n",
" .colab-df-quickchart-complete:disabled,\n",
" .colab-df-quickchart-complete:disabled:hover {\n",
" background-color: var(--disabled-bg-color);\n",
" fill: var(--disabled-fill-color);\n",
" box-shadow: none;\n",
" }\n",
"\n",
" .colab-df-spinner {\n",
" border: 2px solid var(--fill-color);\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" animation:\n",
" spin 1s steps(1) infinite;\n",
" }\n",
"\n",
" @keyframes spin {\n",
" 0% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" border-left-color: var(--fill-color);\n",
" }\n",
" 20% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 30% {\n",
" border-color: transparent;\n",
" border-left-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 40% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-top-color: var(--fill-color);\n",
" }\n",
" 60% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" }\n",
" 80% {\n",
" border-color: transparent;\n",
" border-right-color: var(--fill-color);\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" 90% {\n",
" border-color: transparent;\n",
" border-bottom-color: var(--fill-color);\n",
" }\n",
" }\n",
"</style>\n",
"\n",
" <script>\n",
" async function quickchart(key) {\n",
" const quickchartButtonEl =\n",
" document.querySelector('#' + key + ' button');\n",
" quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n",
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
" try {\n",
" const charts = await google.colab.kernel.invokeFunction(\n",
" 'suggestCharts', [key], {});\n",
" } catch (error) {\n",
" console.error('Error during call to suggestCharts:',
error);\n",
" }\n",
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
" quickchartButtonEl.classList.add('colab-df-quickchart-
complete');\n",
" }\n",
" (() => {\n",
" let quickchartButtonEl =\n",
" document.querySelector('#df-ed5d9ab4-6f72-4f7b-bb96-
6aa1ec9c235c button');\n",
" quickchartButtonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
" })();\n",
" </script>\n",
"</div>\n",
" </div>\n",
" </div>\n"
]
},
"metadata": {},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"source": [
"y"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fKrU0aElxUER",
"outputId": "18729867-e2ca-46e5-883d-4c32cd8205a8"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 positive\n",
"1 positive\n",
"2 positive\n",
"3 negative\n",
"4 positive\n",
" ... \n",
"9995 positive\n",
"9996 negative\n",
"9997 negative\n",
"9998 negative\n",
"9999 positive\n",
"Name: sentiment, Length: 9983, dtype: object"
]
},
"metadata": {},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"encoder = LabelEncoder()\n",
"\n",
"# Encode straight from the source column rather than from `y`, so re-running this cell\n",
"# never refits on already-encoded integers and encoder.classes_ keeps the string labels.\n",
"y = encoder.fit_transform(df['sentiment'])"
],
"metadata": {
"id": "Rhdxh4DjxVIX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"y"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jsci276DxmGp",
"outputId": "a0d5030c-c5ee-41b0-e5f7-15a37db2bee4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([1, 1, 1, ..., 0, 0, 1])"
]
},
"metadata": {},
"execution_count": 29
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=1\n",
")"
],
"metadata": {
"id": "VCwFziqZxmEy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"X_train.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_nHUxV6mxmCL",
"outputId": "4cba7aed-495a-489a-e41c-21af59d7767b"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(7986, 1)"
]
},
"metadata": {},
"execution_count": 31
}
]
},
{
"cell_type": "code",
"source": [
"X_test.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sjbsh1o2x0Y_",
"outputId": "0e513f93-ed46-4967-a9bb-6c48f9baf2a5"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(1997, 1)"
]
},
"metadata": {},
"execution_count": 32
}
]
},
{
"cell_type": "code",
"source": [
"# Bag-of-words: build a token-count vectorizer with default settings\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"cv = CountVectorizer()"
],
"metadata": {
"id": "8AODw1Mdx4z5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Learn the vocabulary from the training reviews only, then reuse it for the test reviews.\n",
"train_counts = cv.fit_transform(X_train['review'])\n",
"test_counts = cv.transform(X_test['review'])\n",
"\n",
"# Convert the count matrices to dense NumPy arrays for the model-fitting cell.\n",
"X_train_bow = train_counts.toarray()\n",
"X_test_bow = test_counts.toarray()"
],
"metadata": {
"id": "eVKQIYZTyA_B"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"X_train_bow"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q2QUPLcayL8o",
"outputId": "17add990-5a15-4780-819e-2ddd34ef37fe"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" ...,\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0]])"
]
},
"metadata": {},
"execution_count": 35
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"# fit() returns the estimator itself, so `gnb` is the trained model and the cell still displays it.\n",
"gnb = GaussianNB().fit(X_train_bow, y_train)\n",
"gnb"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 74
},
"id": "emjpq33fyOEP",
"outputId": "de77a7d3-8713-47a6-87d9-eef729a199f8"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"GaussianNB()"
],
"text/html": [
"<style>#sk-container-id-1 {color: black;background-color:
white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable
{background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor:
pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing:
border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-
arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-
container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-
container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before
{color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-
width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-
container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-
radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-
toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-
width: 100%;overflow: auto;}#sk-container-id-1 input.sk-
toggleable__control:checked~label.sk-toggleable__label-arrow:before
{content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-
toggleable__control:checked~label.sk-toggleable__label {background-color:
#d4ebff;}#sk-container-id-1 div.sk-label input.sk-
toggleable__control:checked~label.sk-toggleable__label {background-color:
#d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px
1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow:
hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-
estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted
black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-
container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-
1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid
gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label
{background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before
{content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-
box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial
{display: flex;flex-direction: column;align-items: center;background-color:
white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-
id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel
{display: flex;align-items: stretch;justify-content: center;background-color:
white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-
1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px
solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-
container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index:
1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-
item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1
div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-
container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1
div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-
sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-
1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-
block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align:
center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets
`[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !
important; }` so we also need the `!important` here to be able to override the
default hidden behavior on the sphinx rendered scikit-learn.org. See:
https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !
important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback
{display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-
container\"><div class=\"sk-text-repr-fallback\"><pre>GaussianNB()</pre><b>In a
Jupyter environment, please rerun this cell to show the HTML representation or
trust the notebook. <br />On GitHub, the HTML representation is unable to render,
please try loading this page with nbviewer.org.</b></div><div class=\"sk-
container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-
toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-
estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\"
class=\"sk-toggleable__label sk-toggleable__label-arrow\">GaussianNB</label><div
class=\"sk-toggleable__content\"><pre>GaussianNB()</pre></div></div></div></div></
div>"
]
},
"metadata": {},
"execution_count": 36
}
]
},
{
"cell_type": "code",
"source": [
"y_pred = gnb.predict(X_test_bow)\n",
"\n",
"from sklearn.metrics import accuracy_score,confusion_matrix\n",
"accuracy_score(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GXpZK8i2yarw",
"outputId": "84880879-f6f1-4889-e2f9-82791fdb64a7"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.6324486730095142"
]
},
"metadata": {},
"execution_count": 37
}
]
},
{
"cell_type": "code",
"source": [
"confusion_matrix(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FoGBiwG3ykKv",
"outputId": "da2fc109-844f-42bc-b41d-ff8e29ef8bca"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[717, 235],\n",
" [499, 546]])"
]
},
"metadata": {},
"execution_count": 38
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_bow,y_train)\n",
"\n",
"y_pred = rf.predict(X_test_bow)\n",
"\n",
"accuracy_score(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jSAnt1_GypjA",
"outputId": "d19e9852-efa4-4014-d90f-01208c54ba48"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8447671507260891"
]
},
"metadata": {},
"execution_count": 39
}
]
},
{
"cell_type": "code",
"source": [
"cv = CountVectorizer(max_features=3000)\n",
"\n",
"X_train_bow = cv.fit_transform(X_train['review']).toarray()\n",
"X_test_bow = cv.transform(X_test['review']).toarray()\n",
"\n",
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_bow,y_train)\n",
"y_pred = rf.predict(X_test_bow)\n",
"accuracy_score(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YTEogOQny17A",
"outputId": "70e2855b-04af-4b13-e17b-8b3269c8936b"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8417626439659489"
]
},
"metadata": {},
"execution_count": 40
}
]
},
{
"cell_type": "markdown",
"source": [
"# N grams"
],
"metadata": {
"id": "LKWGexU401w5"
}
},
{
"cell_type": "code",
"source": [
"cv = CountVectorizer(ngram_range=(1,2),max_features=5000)\n",
"\n",
"X_train_bow = cv.fit_transform(X_train['review']).toarray()\n",
"X_test_bow = cv.transform(X_test['review']).toarray()\n",
"\n",
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_bow,y_train)\n",
"y_pred = rf.predict(X_test_bow)\n",
"accuracy_score(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Xz8EmJLX0V5J",
"outputId": "9b302572-c8e1-4e33-96c9-15198111652b"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8432648973460191"
]
},
"metadata": {},
"execution_count": 41
}
]
},
{
"cell_type": "markdown",
"source": [
"TFIDF"
],
"metadata": {
"id": "JR5q6_g-0-lD"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tfidf = TfidfVectorizer()"
],
"metadata": {
"id": "YfgbV3bx07C-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()\n",
"X_test_tfidf = tfidf.transform(X_test['review'])"
],
"metadata": {
"id": "mSzlYnS61CNv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_tfidf,y_train)\n",
"y_pred = rf.predict(X_test_tfidf)\n",
"\n",
"accuracy_score(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "NBdpfOkK1D3v",
"outputId": "0bdda395-294a-4539-d3de-fabc4a9d363e"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8512769153730596"
]
},
"metadata": {},
"execution_count": 44
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "DECUvGMT1GNH"
},
"execution_count": null,
"outputs": []
}
]
}