1.Text Preprocessing.ipynb
1.Text Preprocessing.ipynb
Dataset:\n","https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"],
"metadata":{"id":"XQ7oje4Bik85"}},{"cell_type":"code","source":
["import pandas as pd"],"metadata":{"id":"aW61ohEtjx-H","executionInfo":
{"status":"ok","timestamp":1702116064666,"user_tz":-360,"elapsed":2,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}}},"execution_count":1,"outputs":[]},
{"cell_type":"code","source":["from google.colab import
drive\n","drive.mount('/content/drive')"],"metadata":{"colab":
{"base_uri":"https://localhost:8080/"},"id":"0vPwaIrwBeI7","executionInfo":
{"status":"ok","timestamp":1702116132301,"user_tz":-360,"elapsed":66872,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"97209553-78a4-41e5-83f6-
025da63bfdb7"},"execution_count":2,"outputs":
[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},
{"cell_type":"code","source":["%cd
/content/drive/MyDrive/005_BOKTIAR_AHMED_BAPPY/My_classes/DSM
1.0/ALL_NOTES/Day_02/data"],"metadata":{"colab":{"base_uri":"https://
localhost:8080/"},"id":"8VS5a_X0B5n5","executionInfo":
{"status":"ok","timestamp":1702116177831,"user_tz":-360,"elapsed":613,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"26a8ebbe-04f8-4552-bd2d-
8181b072f0a1"},"execution_count":3,"outputs":
[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/
005_BOKTIAR_AHMED_BAPPY/My_classes/DSM 1.0/ALL_NOTES/Day_02/data\n"]}]},
{"cell_type":"code","source":["!pwd"],"metadata":{"colab":{"base_uri":"https://
localhost:8080/"},"id":"QZU6QMAFB-Bk","executionInfo":
{"status":"ok","timestamp":1702116195595,"user_tz":-360,"elapsed":912,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"6aca1b2c-c008-402f-f508-
01d727ff269d"},"execution_count":4,"outputs":
[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/
005_BOKTIAR_AHMED_BAPPY/My_classes/DSM 1.0/ALL_NOTES/Day_02/data\n"]}]},
{"cell_type":"code","source":["!ls"],"metadata":{"colab":{"base_uri":"https://
localhost:8080/"},"id":"F5b9gqeCCAod","executionInfo":
{"status":"ok","timestamp":1702116204260,"user_tz":-360,"elapsed":935,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"be54a379-2981-4afd-c5c2-
cd52b8890316"},"execution_count":5,"outputs":
[{"output_type":"stream","name":"stdout","text":["'IMDB Dataset.csv' test.txt
'Twitter Sentiments.csv'\n"]}]},{"cell_type":"code","source":["# !unzip
/content/IMDB_Dataset.zip"],"metadata":{"colab":{"base_uri":"https://
localhost:8080/"},"id":"aSPG8SToiHrR","executionInfo":
{"status":"ok","timestamp":1694927240676,"user_tz":-360,"elapsed":713,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"c00f89aa-38b6-422e-e6c4-
172ae097c67d"},"execution_count":null,"outputs":
[{"output_type":"stream","name":"stdout","text":["Archive:
/content/IMDB_Dataset.zip\n"," inflating: IMDB Dataset.csv \n"]}]},
{"cell_type":"code","source":["data_path = \"IMDB Dataset.csv\""],"metadata":
{"id":"rFFr9P5Gj3la","executionInfo":
{"status":"ok","timestamp":1702116239440,"user_tz":-360,"elapsed":824,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}}},"execution_count":6,"outputs":[]},
{"cell_type":"code","execution_count":7,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:41.835385Z","iopub.status.busy":"2021-12-
31T06:31:41.834548Z","iopub.status.idle":"2021-12-
31T06:31:43.691014Z","shell.execute_reply":"2021-12-
31T06:31:43.689926Z","shell.execute_reply.started":"2021-12-
31T06:31:41.835329Z"},"id":"p3I8FoIcfXI1","executionInfo":
{"status":"ok","timestamp":1702116244801,"user_tz":-360,"elapsed":2905,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["df = pd.read_csv(data_path)"]},
{"cell_type":"code","execution_count":10,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:45.066838Z","iopub.status.busy":"2021-12-
31T06:31:45.066171Z","iopub.status.idle":"2021-12-
31T06:31:45.074943Z","shell.execute_reply":"2021-12-
31T06:31:45.074024Z","shell.execute_reply.started":"2021-12-
31T06:31:45.066796Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"Z2xd0TNcfXI2","executionInfo":
{"status":"ok","timestamp":1702116523358,"user_tz":-360,"elapsed":1057,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"c5302197-48cf-41aa-b175-
97c3d61f5c7a"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["(50000, 2)"]},"metadata":{},"execution_count":10}],"source":["df.shape"]},
{"cell_type":"code","source":["df.head()"],"metadata":{"colab":
{"base_uri":"https://
localhost:8080/","height":206},"id":"sVljHc0QjOtx","executionInfo":
{"status":"ok","timestamp":1702116248916,"user_tz":-360,"elapsed":12,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"118afd0d-f34b-4541-fb36-
ecd96950fabd"},"execution_count":9,"outputs":
[{"output_type":"execute_result","data":{"text/plain":["
review sentiment\n","0 One of the other reviewers has mentioned that ...
positive\n","1 A wonderful little production. <br /><br />The... positive\n","2
I thought this was a wonderful way to spend ti... positive\n","3 Basically
there's a family where a little boy ... negative\n","4 Petter Mattei's \"Love in
the Time of Money\" is... positive"],"text/html":["\n"," <div id=\"df-5cce1cc2-
ce17-4546-9a07-6daf87a978a4\" class=\"colab-df-container\">\n"," <div>\
n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n","
vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n","
vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-
align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\
n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n","
<th>review</th>\n"," <th>sentiment</th>\n"," </tr>\n"," </thead>\n","
<tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>One of the other reviewers
has mentioned that ...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\
n"," <th>1</th>\n"," <td>A wonderful little production. <br
/><br />The...</td>\n"," <td>positive</td>\n"," </tr>\n","
<tr>\n"," <th>2</th>\n"," <td>I thought this was a wonderful way to spend
ti...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\n","
<th>3</th>\n"," <td>Basically there's a family where a little boy ...</td>\
n"," <td>negative</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n","
<td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n","
<td>positive</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div
class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n","
<button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5cce1cc2-
ce17-4546-9a07-6daf87a978a4')\"\n"," title=\"Convert this dataframe to
an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\
n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-
160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-
160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\
n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n","
display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n","
background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\
n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n","
height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\
n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n","
box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67,
0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n","
margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n","
background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n","
[theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n","
box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px
1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\
n","\n"," <script>\n"," const buttonEl =\n","
document.querySelector('#df-5cce1cc2-ce17-4546-9a07-6daf87a978a4 button.colab-df-
convert');\n"," buttonEl.style.display =\n","
google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function
convertToInteractive(key) {\n"," const element =
document.querySelector('#df-5cce1cc2-ce17-4546-9a07-6daf87a978a4');\n","
const dataTable =\n"," await
google.colab.kernel.invokeFunction('convertToInteractive',\n","
[key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml
= 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n"," + ' to learn more about interactive tables.';\n","
element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n","
await google.colab.output.renderOutput(dataTable, element);\n"," const
docLink = document.createElement('div');\n"," docLink.innerHTML =
docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n","
</script>\n"," </div>\n","\n","\n","<div id=\"df-22aae921-c7fa-4a63-84dc-
00e4ae4bd99e\">\n","
<button class=\"colab-df-quickchart\" onclick=\"quickchart('df-22aae921-c7fa-
4a63-84dc-00e4ae4bd99e')\"\n"," title=\"Suggest charts\"\n","
style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\"
height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <g>\n","
<path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-
2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n"," </g>\n","</svg>\n","
</button>\n","\n","<style>\n"," .colab-df-quickchart {\n"," --bg-color:
#E8F0FE;\n"," --fill-color: #1967D2;\n"," --hover-bg-color: #E2EBFA;\n","
--hover-fill-color: #174EA6;\n"," --disabled-fill-color: #AAA;\n"," --
disabled-bg-color: #DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\
n"," --bg-color: #3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-
bg-color: #434B5C;\n"," --hover-fill-color: #FFFFFF;\n"," --disabled-bg-
color: #3B4455;\n"," --disabled-fill-color: #666;\n"," }\n","\n"," .colab-
df-quickchart {\n"," background-color: var(--bg-color);\n"," border: none;\
n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n","
fill: var(--fill-color);\n"," height: 32px;\n"," padding: 0;\n"," width:
32px;\n"," }\n","\n"," .colab-df-quickchart:hover {\n"," background-color:
var(--hover-bg-color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px
3px 1px rgba(60, 64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n","
}\n","\n"," .colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-
complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n","
fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\
n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n","
border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n","
animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin
{\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color:
var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n","
20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-
color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n","
border-color: transparent;\n"," border-left-color: var(--fill-color);\n","
border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-
color);\n"," }\n"," 40% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-
color);\n"," }\n"," 60% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-
color: transparent;\n"," border-right-color: var(--fill-color);\n","
border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-
color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\
n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\
n"," const quickchartButtonEl =\n"," document.querySelector('#' + key +
' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n","
try {\n"," const charts = await google.colab.kernel.invokeFunction(\n","
'suggestCharts', [key], {});\n"," } catch (error) {\n","
console.error('Error during call to suggestCharts:', error);\n"," }\n","
quickchartButtonEl.classList.remove('colab-df-spinner');\n","
quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n","
(() => {\n"," let quickchartButtonEl =\n","
document.querySelector('#df-22aae921-c7fa-4a63-84dc-00e4ae4bd99e button');\n","
quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ?
'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n","
</div>\n"]},"metadata":{},"execution_count":9}]},{"cell_type":"code","source":["df
= df.head(100)"],"metadata":{"id":"eZCM9UfFmf1M","executionInfo":
{"status":"ok","timestamp":1702116561702,"user_tz":-360,"elapsed":606,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}}},"execution_count":11,"outputs":[]},
{"cell_type":"code","source":["df.shape"],"metadata":{"colab":
{"base_uri":"https://localhost:8080/"},"id":"9O0EVPWxmvEk","executionInfo":
{"status":"ok","timestamp":1702116565709,"user_tz":-360,"elapsed":880,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"5d715949-89d4-427f-ab64-
d2ab8ce784f0"},"execution_count":12,"outputs":
[{"output_type":"execute_result","data":{"text/plain":["(100, 2)"]},"metadata":
{},"execution_count":12}]},{"cell_type":"code","execution_count":13,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:31:45.928154Z","iopub.status.busy":"2021-12-
31T06:31:45.927888Z","iopub.status.idle":"2021-12-
31T06:31:45.945137Z","shell.execute_reply":"2021-12-
31T06:31:45.944403Z","shell.execute_reply.started":"2021-12-
31T06:31:45.928126Z"},"colab":{"base_uri":"https://
localhost:8080/","height":206},"id":"IK472BClfXI2","executionInfo":
{"status":"ok","timestamp":1702116588435,"user_tz":-360,"elapsed":1394,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"afcc59c2-55b0-4f3a-a6c3-
8487a7b7c57d"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["
review sentiment\n","0 One of the other reviewers has mentioned that ...
positive\n","1 A wonderful little production. <br /><br />The... positive\n","2
I thought this was a wonderful way to spend ti... positive\n","3 Basically
there's a family where a little boy ... negative\n","4 Petter Mattei's \"Love in
the Time of Money\" is... positive"],"text/html":["\n"," <div id=\"df-5a2a5a76-
3e99-46ad-9af7-09d697b12002\" class=\"colab-df-container\">\n"," <div>\
n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n","
vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n","
vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-
align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\
n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n","
<th>review</th>\n"," <th>sentiment</th>\n"," </tr>\n"," </thead>\n","
<tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>One of the other reviewers
has mentioned that ...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\
n"," <th>1</th>\n"," <td>A wonderful little production. <br
/><br />The...</td>\n"," <td>positive</td>\n"," </tr>\n","
<tr>\n"," <th>2</th>\n"," <td>I thought this was a wonderful way to spend
ti...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\n","
<th>3</th>\n"," <td>Basically there's a family where a little boy ...</td>\
n"," <td>negative</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n","
<td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n","
<td>positive</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div
class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n","
<button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5a2a5a76-
3e99-46ad-9af7-09d697b12002')\"\n"," title=\"Convert this dataframe to
an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\
n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-
160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-
160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\
n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n","
display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n","
background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\
n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n","
height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\
n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n","
box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67,
0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n","
margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n","
background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n","
[theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n","
box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px
1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\
n","\n"," <script>\n"," const buttonEl =\n","
document.querySelector('#df-5a2a5a76-3e99-46ad-9af7-09d697b12002 button.colab-df-
convert');\n"," buttonEl.style.display =\n","
google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function
convertToInteractive(key) {\n"," const element =
document.querySelector('#df-5a2a5a76-3e99-46ad-9af7-09d697b12002');\n","
const dataTable =\n"," await
google.colab.kernel.invokeFunction('convertToInteractive',\n","
[key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml
= 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data
table notebook</a>'\n"," + ' to learn more about interactive tables.';\
n"," element.innerHTML = '';\n"," dataTable['output_type'] =
'display_data';\n"," await google.colab.output.renderOutput(dataTable,
element);\n"," const docLink = document.createElement('div');\n","
docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n","
}\n"," </script>\n"," </div>\n","\n","\n","<div id=\"df-4fb431f4-e92e-4030-
b4f4-f760f1a3f2dc\">\n"," <button class=\"colab-df-quickchart\"
onclick=\"quickchart('df-4fb431f4-e92e-4030-b4f4-f760f1a3f2dc')\"\n","
title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","
width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0
1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-
2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\
n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color:
#1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color:
#174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color:
#DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color:
#3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n","
--hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --
disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n","
background-color: var(--bg-color);\n"," border: none;\n"," border-radius:
50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-
color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\
n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-
color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60,
64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\
n"," .colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-
complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n","
fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\
n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n","
border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n","
animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin
{\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color:
var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n","
20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-
color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n","
border-color: transparent;\n"," border-left-color: var(--fill-color);\n","
border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-
color);\n"," }\n"," 40% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-
color);\n"," }\n"," 60% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-
color: transparent;\n"," border-right-color: var(--fill-color);\n","
border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-
color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\
n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\
n"," const quickchartButtonEl =\n"," document.querySelector('#' + key +
' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n","
try {\n"," const charts = await google.colab.kernel.invokeFunction(\n","
'suggestCharts', [key], {});\n"," } catch (error) {\n","
console.error('Error during call to suggestCharts:', error);\n"," }\n","
quickchartButtonEl.classList.remove('colab-df-spinner');\n","
quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n","
(() => {\n"," let quickchartButtonEl =\n","
document.querySelector('#df-4fb431f4-e92e-4030-b4f4-f760f1a3f2dc button');\n","
quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ?
'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n","
</div>\n"]},"metadata":{},"execution_count":13}],"source":["df.head()"]},
{"cell_type":"markdown","metadata":{"id":"2904AVZxfXI3"},"source":["# lower
case"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:46.622825Z","iopub.status.busy":"2021-12-
31T06:31:46.622155Z","iopub.status.idle":"2021-12-
31T06:31:46.631721Z","shell.execute_reply":"2021-12-
31T06:31:46.631006Z","shell.execute_reply.started":"2021-12-
31T06:31:46.622784Z"},"colab":{"base_uri":"https://
localhost:8080/","height":139},"id":"GmzqPSelfXI_","executionInfo":
{"status":"ok","timestamp":1702116770355,"user_tz":-360,"elapsed":879,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"360c6053-5c6b-4614-8f56-
67e3c7034af2"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["\"Basically there's a family where a little boy (Jake) thinks there's a zombie in
his closet & his parents are fighting all the time.<br /><br />This movie is slower
than a soap opera... and suddenly, Jake decides to become Rambo and kill the
zombie.<br /><br />OK, first of all when you're going to make a film you must
Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are
divorcing & arguing like in real life. And then we have Jake with his closet which
totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and
instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of
10 just for the well playing parents & descent dialogs. As for the shots with Jake:
just ignore them.\""],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":14}],"source":["df['review']
[3]"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:47.787087Z","iopub.status.busy":"2021-12-
31T06:31:47.786303Z","iopub.status.idle":"2021-12-
31T06:31:47.951308Z","shell.execute_reply":"2021-12-
31T06:31:47.950466Z","shell.execute_reply.started":"2021-12-
31T06:31:47.787050Z"},"id":"i89hq6VMfXJB","executionInfo":
{"status":"ok","timestamp":1702116806371,"user_tz":-360,"elapsed":577,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["df['review'] = df['review'].str.lower()"]},
{"cell_type":"code","execution_count":16,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:49.128530Z","iopub.status.busy":"2021-12-
31T06:31:49.128210Z","iopub.status.idle":"2021-12-
31T06:31:49.139811Z","shell.execute_reply":"2021-12-
31T06:31:49.139029Z","shell.execute_reply.started":"2021-12-
31T06:31:49.128496Z"},"colab":{"base_uri":"https://
localhost:8080/","height":423},"id":"glDKwJu4fXJC","executionInfo":
{"status":"ok","timestamp":1702116810079,"user_tz":-360,"elapsed":846,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"c65bd924-5a8d-44fd-b86f-
8d9b838e24e0"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["
review sentiment\n","0 one of the other reviewers has mentioned that ...
positive\n","1 a wonderful little production. <br /><br />the... positive\n","2
i thought this was a wonderful way to spend ti... positive\n","3 basically
there's a family where a little boy ... negative\n","4 petter mattei's \"love in
the time of money\" is... positive\n","..
... ...\n","95 daniel day-lewis is the most versatile actor a... positive\
n","96 my guess would be this was originally going to... negative\n","97 well, i
like to watch bad horror b-movies, cau... negative\n","98 this is the worst movie
i have ever seen, as w... negative\n","99 i have been a mario fan for as long as
i can r... positive\n","\n","[100 rows x 2 columns]"],"text/html":["\n"," <div
id=\"df-3e7f1e78-674e-4244-b3d9-796b2b4138ac\" class=\"colab-df-container\">\n","
<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n","
vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n","
vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-
align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\
n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n","
<th>review</th>\n"," <th>sentiment</th>\n"," </tr>\n"," </thead>\n","
<tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>one of the other reviewers
has mentioned that ...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\
n"," <th>1</th>\n"," <td>a wonderful little production. <br
/><br />the...</td>\n"," <td>positive</td>\n"," </tr>\n","
<tr>\n"," <th>2</th>\n"," <td>i thought this was a wonderful way to spend
ti...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\n","
<th>3</th>\n"," <td>basically there's a family where a little boy ...</td>\
n"," <td>negative</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n","
<td>petter mattei's \"love in the time of money\" is...</td>\n","
<td>positive</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n","
<td>...</td>\n","
<td>...</td>\n"," </tr>\n"," <tr>\n"," <th>95</th>\n","
<td>daniel day-lewis is the most versatile actor a...</td>\n","
<td>positive</td>\n"," </tr>\n"," <tr>\n"," <th>96</th>\n"," <td>my
guess would be this was originally going to...</td>\n"," <td>negative</td>\
n"," </tr>\n"," <tr>\n"," <th>97</th>\n"," <td>well, i like to
watch bad horror b-movies, cau...</td>\n"," <td>negative</td>\n"," </tr>\
n"," <tr>\n"," <th>98</th>\n"," <td>this is the worst movie i have
ever seen, as w...</td>\n"," <td>negative</td>\n"," </tr>\n"," <tr>\n","
<th>99</th>\n"," <td>i have been a mario fan for as long as i can r...</td>\
n"," <td>positive</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>100
rows × 2 columns</p>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n","
<div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\"
onclick=\"convertToInteractive('df-3e7f1e78-674e-4244-b3d9-796b2b4138ac')\"\n","
title=\"Convert this dataframe to an interactive table.\"\n","
style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\"
height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-
720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-
160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-
160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n","
<style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\
n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\
n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\
n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n","
padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-
convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px
2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill:
#174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom:
4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-
color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n","
[theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n","
box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px
1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\
n","\n"," <script>\n"," const buttonEl =\n","
document.querySelector('#df-3e7f1e78-674e-4244-b3d9-796b2b4138ac button.colab-df-
convert');\n"," buttonEl.style.display =\n","
google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function
convertToInteractive(key) {\n"," const element =
document.querySelector('#df-3e7f1e78-674e-4244-b3d9-796b2b4138ac');\n","
const dataTable =\n"," await
google.colab.kernel.invokeFunction('convertToInteractive',\n","
[key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml
= 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n"," + ' to learn more about interactive tables.';\n","
element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n","
await google.colab.output.renderOutput(dataTable, element);\n"," const
docLink = document.createElement('div');\n"," docLink.innerHTML =
docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n","
</script>\n"," </div>\n","\n","\n","<div id=\"df-3043c2d6-11c7-4f50-b440-
156d279776a5\">\n"," <button class=\"colab-df-quickchart\"
onclick=\"quickchart('df-3043c2d6-11c7-4f50-b440-156d279776a5')\"\n","
title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","
width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0
1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-
2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\
n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color:
#1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color:
#174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color:
#DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color:
#3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n","
--hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --
disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n","
background-color: var(--bg-color);\n"," border: none;\n"," border-radius:
50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-
color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\
n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-
color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60,
64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\
n"," .colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-
complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n","
fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\
n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n","
border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n","
animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin
{\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color:
var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n","
20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-
color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n","
border-color: transparent;\n"," border-left-color: var(--fill-color);\n","
border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-
color);\n"," }\n"," 40% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-
color);\n"," }\n"," 60% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-
color: transparent;\n"," border-right-color: var(--fill-color);\n","
border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-
color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\
n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\
n"," const quickchartButtonEl =\n"," document.querySelector('#' + key +
' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n","
try {\n"," const charts = await google.colab.kernel.invokeFunction(\n","
'suggestCharts', [key], {});\n"," } catch (error) {\n","
console.error('Error during call to suggestCharts:', error);\n"," }\n","
quickchartButtonEl.classList.remove('colab-df-spinner');\n","
quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n","
(() => {\n"," let quickchartButtonEl =\n","
document.querySelector('#df-3043c2d6-11c7-4f50-b440-156d279776a5 button');\n","
quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ?
'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n","
</div>\n"]},"metadata":{},"execution_count":16}],"source":["df"]},
{"cell_type":"code","source":["df['review'][3]"],"metadata":{"colab":
{"base_uri":"https://localhost:8080/","height":139},"id":"Qw-
VymfykdQg","executionInfo":{"status":"ok","timestamp":1702116816023,"user_tz":-
360,"elapsed":595,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"c08539d4-c8f8-450c-a678-
f7e0564a3937"},"execution_count":17,"outputs":
[{"output_type":"execute_result","data":{"text/plain":["\"basically there's a
family where a little boy (jake) thinks there's a zombie in his closet & his
parents are fighting all the time.<br /><br />this movie is slower than a soap
opera... and suddenly, jake decides to become rambo and kill the zombie.<br
/><br />ok, first of all when you're going to make a film you must decide if its a
thriller or a drama! as a drama the movie is watchable. parents are divorcing &
arguing like in real life. and then we have jake with his closet which totally
ruins all the film! i expected to see a boogeyman similar movie, and instead i
watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just
for the well playing parents & descent dialogs. as for the shots with jake: just
ignore them.\""],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":17}]},
{"cell_type":"markdown","metadata":{"id":"q6PdP4pvfXJD"},"source":["#
remove_html_tags"]},{"cell_type":"code","execution_count":18,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:31:50.312485Z","iopub.status.busy":"2021-12-
31T06:31:50.311881Z","iopub.status.idle":"2021-12-
31T06:31:50.317355Z","shell.execute_reply":"2021-12-
31T06:31:50.316605Z","shell.execute_reply.started":"2021-12-
31T06:31:50.312441Z"},"id":"TJBPXPKffXJE","executionInfo":
{"status":"ok","timestamp":1702116970620,"user_tz":-360,"elapsed":857,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}}},"outputs":[],"source":["import re\n",
"\n",
"def remove_html_tags(text):\n",
"    \"\"\"Strip HTML/XML-style tags from ``text``.\n",
"\n",
"    Every shortest ``<...>`` span is deleted; the text between tags is kept\n",
"    unchanged, including its surrounding whitespace. A tag that spans a line\n",
"    break is left in place because ``.`` does not match newlines here.\n",
"    \"\"\"\n",
"    return re.sub('<.*?>', '', text)"]},
{"cell_type":"code","execution_count":19,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:51.213645Z","iopub.status.busy":"2021-12-
31T06:31:51.212983Z","iopub.status.idle":"2021-12-
31T06:31:51.217270Z","shell.execute_reply":"2021-12-
31T06:31:51.216678Z","shell.execute_reply.started":"2021-12-
31T06:31:51.213613Z"},"id":"ULnX84uNfXJE","executionInfo":
{"status":"ok","timestamp":1702116972611,"user_tz":-360,"elapsed":5,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["text = \"<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p>
Click here to <a href='http://google.com'>download</a></p></body></html>\""]},
{"cell_type":"code","execution_count":20,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:52.830688Z","iopub.status.busy":"2021-12-
31T06:31:52.830402Z","iopub.status.idle":"2021-12-
31T06:31:52.836008Z","shell.execute_reply":"2021-12-
31T06:31:52.835403Z","shell.execute_reply.started":"2021-12-
31T06:31:52.830656Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"qNvDbZB0fXJF","executionInfo":
{"status":"ok","timestamp":1702116976737,"user_tz":-360,"elapsed":718,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"40f49d2d-e060-4550-ff28-
fa4feb4af331"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["'
Movie 1 Actor - Aamir Khan Click here to
download'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":20}],"source":
["remove_html_tags(text)"]},{"cell_type":"code","execution_count":21,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:31:56.044835Z","iopub.status.busy":"2021-12-
31T06:31:56.044290Z","iopub.status.idle":"2021-12-
31T06:31:56.261722Z","shell.execute_reply":"2021-12-
31T06:31:56.260834Z","shell.execute_reply.started":"2021-12-
31T06:31:56.044797Z"},"id":"yC7UIrYBfXJF","executionInfo":
{"status":"ok","timestamp":1702116996894,"user_tz":-360,"elapsed":591,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["df['review'] = df['review'].apply(remove_html_tags)"]},
{"cell_type":"code","execution_count":22,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:57.128227Z","iopub.status.busy":"2021-12-
31T06:31:57.127915Z","iopub.status.idle":"2021-12-
31T06:31:57.135752Z","shell.execute_reply":"2021-12-
31T06:31:57.134890Z","shell.execute_reply.started":"2021-12-
31T06:31:57.128194Z"},"colab":{"base_uri":"https://
localhost:8080/","height":122},"id":"8dC4Wn-CfXJG","executionInfo":
{"status":"ok","timestamp":1702117006643,"user_tz":-360,"elapsed":608,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"e2ebf177-4500-47a1-a570-
14fbb424fb2f"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'probably my all-time favorite movie, a story of selflessness, sacrifice and
dedication to a noble cause, but it\\'s not preachy or boring. it just never gets
old, despite my having seen it some 15 or more times in the last 25 years. paul
lukas\\' performance brings tears to my eyes, and bette davis, in one of her very
few truly sympathetic roles, is a delight. the kids are, as grandma says, more like
\"dressed-up midgets\" than children, but that only makes them more fun to watch.
and the mother\\'s slow awakening to what\\'s happening in the world and under her
own roof is believable and startling. if i had a dozen thumbs, they\\'d all
be \"up\" for this movie.'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":22}],"source":["df['review']
[5]"]},{"cell_type":"markdown","metadata":{"id":"UdMkoipYfXJG"},"source":["#
remove_url"]},{"cell_type":"code","execution_count":23,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:31:59.535009Z","iopub.status.busy":"2021-12-
31T06:31:59.534577Z","iopub.status.idle":"2021-12-
31T06:31:59.538828Z","shell.execute_reply":"2021-12-
31T06:31:59.537932Z","shell.execute_reply.started":"2021-12-
31T06:31:59.534968Z"},"id":"3BmkkiKWfXJG","executionInfo":
{"status":"ok","timestamp":1702117068152,"user_tz":-360,"elapsed":606,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["def remove_url(text):\n",
"    \"\"\"Delete URLs from ``text``.\n",
"\n",
"    Removes every run of non-whitespace characters that starts with\n",
"    ``http://``, ``https://`` or ``www.``. Whitespace around a removed URL\n",
"    is left in place.\n",
"    \"\"\"\n",
"    url_regex = r'https?://\\S+|www\\.\\S+'\n",
"    return re.sub(url_regex, '', text)"]},
{"cell_type":"code","execution_count":24,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:32:01.777326Z","iopub.status.busy":"2021-12-
31T06:32:01.776896Z","iopub.status.idle":"2021-12-
31T06:32:01.782039Z","shell.execute_reply":"2021-12-
31T06:32:01.781169Z","shell.execute_reply.started":"2021-12-
31T06:32:01.777289Z"},"id":"M2qrlgUVfXJH","executionInfo":
{"status":"ok","timestamp":1702117082143,"user_tz":-360,"elapsed":848,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["text1 = 'Check out my youtube https://www.youtube.com/dswithbappy
dswithbappy'\n","text2 = 'Check out my linkedin
https://www.linkedin.com/in/boktiarahmed73/'\n","text3 = 'Google search here
www.google.com'\n","text4 = 'For data click https://www.kaggle.com/'"]},
{"cell_type":"code","execution_count":26,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:32:13.347025Z","iopub.status.busy":"2021-12-
31T06:32:13.346432Z","iopub.status.idle":"2021-12-
31T06:32:13.352108Z","shell.execute_reply":"2021-12-
31T06:32:13.351556Z","shell.execute_reply.started":"2021-12-
31T06:32:13.346979Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"o3wQdDYGfXJI","executionInfo":
{"status":"ok","timestamp":1702117099378,"user_tz":-360,"elapsed":619,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"8a778636-1898-48aa-b5de-
ac78158f38ee"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'Check out my linkedin '"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":26}],"source":
["remove_url(text2)"]},{"cell_type":"markdown","metadata":{"id":"NxXTlMd-
fXJI"},"source":["# punctuation handling"]},
{"cell_type":"code","execution_count":27,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:32:15.220391Z","iopub.status.busy":"2021-12-
31T06:32:15.219430Z","iopub.status.idle":"2021-12-
31T06:32:15.226608Z","shell.execute_reply":"2021-12-
31T06:32:15.225774Z","shell.execute_reply.started":"2021-12-
31T06:32:15.220317Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"9PqZWxf9fXJJ","executionInfo":
{"status":"ok","timestamp":1702117155911,"user_tz":-360,"elapsed":604,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"3401aab1-a446-4951-fa2f-
cb850ac2e539"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"],"application/
vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":
{},"execution_count":27}],"source":["import string,time\n","string.punctuation"]},
{"cell_type":"code","execution_count":28,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:32:17.479876Z","iopub.status.busy":"2021-12-
31T06:32:17.479110Z","iopub.status.idle":"2021-12-
31T06:32:17.482866Z","shell.execute_reply":"2021-12-
31T06:32:17.482313Z","shell.execute_reply.started":"2021-12-
31T06:32:17.479840Z"},"id":"HeyKLJu_fXJJ","executionInfo":
{"status":"ok","timestamp":1702117343261,"user_tz":-360,"elapsed":860,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}},"colab":
{"base_uri":"https://localhost:8080/","height":35},"outputId":"4f77d47f-e303-4931-
ef42-79df294f45f5"},"outputs":[{"output_type":"execute_result","data":{"text/
plain":["'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"],"application/
vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":
{},"execution_count":28}],"source":["exclude = string.punctuation\n","exclude"]},
{"cell_type":"code","execution_count":29,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:32:55.078670Z","iopub.status.busy":"2021-12-
31T06:32:55.078307Z","iopub.status.idle":"2021-12-
31T06:32:55.083271Z","shell.execute_reply":"2021-12-
31T06:32:55.082403Z","shell.execute_reply.started":"2021-12-
31T06:32:55.078636Z"},"id":"KkB59_vLfXJK","executionInfo":
{"status":"ok","timestamp":1702117378675,"user_tz":-360,"elapsed":871,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["def remove_punc(text):\n",
"    \"\"\"Return ``text`` without any character listed in ``exclude``.\n",
"\n",
"    Runs one ``str.replace`` pass per character of the global ``exclude``\n",
"    string.\n",
"    \"\"\"\n",
"    cleaned = text\n",
"    for symbol in exclude:\n",
"        cleaned = cleaned.replace(symbol, '')\n",
"    return cleaned\n"]},
{"cell_type":"code","execution_count":30,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:33:45.243742Z","iopub.status.busy":"2021-12-
31T06:33:45.243469Z","iopub.status.idle":"2021-12-
31T06:33:45.247986Z","shell.execute_reply":"2021-12-
31T06:33:45.247407Z","shell.execute_reply.started":"2021-12-
31T06:33:45.243713Z"},"id":"Xc5SxdEhfXJK","executionInfo":
{"status":"ok","timestamp":1702117381990,"user_tz":-360,"elapsed":913,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["text = 'string. With. Punctuation?'"]},
{"cell_type":"code","execution_count":31,"metadata":{"execution":
{"iopub.execute_input":"2021-12-08T11:47:24.652052Z","iopub.status.busy":"2021-12-
08T11:47:24.651733Z","iopub.status.idle":"2021-12-
08T11:47:24.658283Z","shell.execute_reply":"2021-12-
08T11:47:24.65749Z","shell.execute_reply.started":"2021-12-
08T11:47:24.652024Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"UhFUBAhxfXJK","executionInfo":
{"status":"ok","timestamp":1702117395856,"user_tz":-360,"elapsed":843,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"a1405107-f0bf-4f5a-cc89-
7c83300f2fa5"},"outputs":[{"output_type":"stream","name":"stdout",
"text":["string With Punctuation\n","10.788440704345703\n"]}],"source":["start
= time.time()\n","print(remove_punc(text))\n","time1 = time.time() - start\
n","print(time1*50000)"]},{"cell_type":"code","execution_count":32,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:35:41.647925Z","iopub.status.busy":"2021-12-
31T06:35:41.647652Z","iopub.status.idle":"2021-12-
31T06:35:41.652722Z","shell.execute_reply":"2021-12-
31T06:35:41.651839Z","shell.execute_reply.started":"2021-12-
31T06:35:41.647898Z"},"id":"mig_Sd1zfXJL","executionInfo":
{"status":"ok","timestamp":1702117566123,"user_tz":-360,"elapsed":835,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["def remove_punc1(text):\n",
"    \"\"\"Return ``text`` without any character listed in ``exclude``.\n",
"\n",
"    Uses a single ``str.translate`` pass with a deletion table built from\n",
"    the global ``exclude`` string.\n",
"    \"\"\"\n",
"    deletion_table = str.maketrans('', '', exclude)\n",
"    return text.translate(deletion_table)"]},
{"cell_type":"code","execution_count":33,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:35:44.574532Z","iopub.status.busy":"2021-12-
31T06:35:44.574025Z","iopub.status.idle":"2021-12-
31T06:35:44.579559Z","shell.execute_reply":"2021-12-
31T06:35:44.578868Z","shell.execute_reply.started":"2021-12-
31T06:35:44.574499Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"DPUZ5MmSfXJL","executionInfo":
{"status":"ok","timestamp":1702117569870,"user_tz":-360,"elapsed":621,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"3972fb50-fc55-483d-9cef-
67b00132a12e"},"outputs":[{"output_type":"stream","name":"stdout","text":
["9.989738464355469\n"]}],"source":["start = time.time()\n","remove_punc1(text)\
n","time2 = time.time() - start\n","print(time2*50000)"]},
{"cell_type":"code","execution_count":34,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:35:50.343180Z","iopub.status.busy":"2021-12-
31T06:35:50.342776Z","iopub.status.idle":"2021-12-
31T06:35:50.411964Z","shell.execute_reply":"2021-12-
31T06:35:50.410707Z","shell.execute_reply.started":"2021-12-
31T06:35:50.343149Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"rxmeooiwfXJL","executionInfo":
{"status":"ok","timestamp":1702117581475,"user_tz":-360,"elapsed":873,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"0b895f23-4d9f-4052-cd26-
5edeab5d2e38"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["1.0799522673031026"]},"metadata":{},"execution_count":34}],"source":["time1/
time2"]},{"cell_type":"code","execution_count":35,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:36:00.861965Z","iopub.status.busy":"2021-12-
31T06:36:00.861687Z","iopub.status.idle":"2021-12-
31T06:36:00.867746Z","shell.execute_reply":"2021-12-
31T06:36:00.867162Z","shell.execute_reply.started":"2021-12-
31T06:36:00.861937Z"},"colab":{"base_uri":"https://
localhost:8080/","height":122},"id":"T-eMG-S5fXJM","executionInfo":
{"status":"ok","timestamp":1702117681656,"user_tz":-360,"elapsed":608,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"d5443a2e-473a-4d2f-a6d8-
c7453fade25c"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'probably my all-time favorite movie, a story of selflessness, sacrifice and
dedication to a noble cause, but it\\'s not preachy or boring. it just never gets
old, despite my having seen it some 15 or more times in the last 25 years. paul
lukas\\' performance brings tears to my eyes, and bette davis, in one of her very
few truly sympathetic roles, is a delight. the kids are, as grandma says, more like
\"dressed-up midgets\" than children, but that only makes them more fun to watch.
and the mother\\'s slow awakening to what\\'s happening in the world and under her
own roof is believable and startling. if i had a dozen thumbs, they\\'d all
be \"up\" for this movie.'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":35}],"source":["df['review']
[5]"]},{"cell_type":"code","execution_count":36,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:36:15.841878Z","iopub.status.busy":"2021-12-
31T06:36:15.841151Z","iopub.status.idle":"2021-12-
31T06:36:15.847766Z","shell.execute_reply":"2021-12-
31T06:36:15.847100Z","shell.execute_reply.started":"2021-12-
31T06:36:15.841838Z"},"id":"y1378-1cfXJM","executionInfo":
{"status":"ok","timestamp":1702117691051,"user_tz":-360,"elapsed":861,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}},"colab":
{"base_uri":"https://localhost:8080/","height":122},"outputId":"90c512e5-60ca-48d0-
bbca-19c4f0a94299"},"outputs":[{"output_type":"execute_result","data":{"text/
plain":["'probably my alltime favorite movie a story of selflessness sacrifice and
dedication to a noble cause but its not preachy or boring it just never gets old
despite my having seen it some 15 or more times in the last 25 years paul lukas
performance brings tears to my eyes and bette davis in one of her very few truly
sympathetic roles is a delight the kids are as grandma says more like dressedup
midgets than children but that only makes them more fun to watch and the mothers
slow awakening to whats happening in the world and under her own roof is believable
and startling if i had a dozen thumbs theyd all be up for this
movie'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":36}],"source":
["remove_punc1(df['review'][5])"]},{"cell_type":"markdown","metadata":
{"id":"5SBlyRsrfXJM"},"source":["# chat_conversion handle"]},
{"cell_type":"code","execution_count":37,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:36:22.728415Z","iopub.status.busy":"2021-12-
31T06:36:22.727766Z","iopub.status.idle":"2021-12-
31T06:36:22.743070Z","shell.execute_reply":"2021-12-
31T06:36:22.742046Z","shell.execute_reply.started":"2021-12-
31T06:36:22.728375Z"},"id":"nXzadIiQfXJM","executionInfo":
{"status":"ok","timestamp":1702117929121,"user_tz":-360,"elapsed":854,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["# Chat abbreviation -> expansion. Keys are upper-case because\n",
"# chat_conversion() looks words up with w.upper().\n",
"# All entries live in this single dict: a second, unassigned {...} literal\n",
"# in the same cell would be evaluated and discarded, never reaching chat_words.\n",
"chat_words = {\n",
"    \"AFAIK\": \"As Far As I Know\",\n",
"    \"AFK\": \"Away From Keyboard\",\n",
"    \"ASAP\": \"As Soon As Possible\",\n",
"    \"FYI\": \"For Your Information\",\n",
"    \"BRB\": \"Be Right Back\",\n",
"    \"BTW\": \"By The Way\",\n",
"    \"OMG\": \"Oh My God\",\n",
"    \"IMO\": \"In My Opinion\",\n",
"    \"LOL\": \"Laugh Out Loud\",\n",
"    \"TTYL\": \"Talk To You Later\",\n",
"    \"GTG\": \"Got To Go\",\n",
"    \"TTYT\": \"Talk To You Tomorrow\",\n",
"    \"IDK\": \"I Don't Know\",\n",
"    \"TMI\": \"Too Much Information\",\n",
"    \"IMHO\": \"In My Humble Opinion\",\n",
"    \"ICYMI\": \"In Case You Missed It\",\n",
"    \"FAQ\": \"Frequently Asked Questions\",\n",
"    \"TGIF\": \"Thank God It's Friday\",\n",
"    \"FYA\": \"For Your Action\",\n",
"}\n"]},{"cell_type":"code","execution_count":38,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:37:18.994223Z","iopub.status.busy":"2021-12-
31T06:37:18.993886Z","iopub.status.idle":"2021-12-
31T06:37:19.000183Z","shell.execute_reply":"2021-12-
31T06:37:18.999265Z","shell.execute_reply.started":"2021-12-
31T06:37:18.994189Z"},"id":"OLgR54U8fXJN","executionInfo":
{"status":"ok","timestamp":1702117994388,"user_tz":-360,"elapsed":876,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["def chat_conversion(text):\n",
"    \"\"\"Expand chat abbreviations found in ``text``.\n",
"\n",
"    ``text`` is split on whitespace; each word whose upper-cased form is a key\n",
"    of the global ``chat_words`` dict is replaced by the mapped phrase, every\n",
"    other word is kept as is. The words are re-joined with single spaces.\n",
"    \"\"\"\n",
"    expanded = (chat_words.get(word.upper(), word) for word in text.split())\n",
"    return \" \".join(expanded)"]},
{"cell_type":"code","execution_count":39,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:37:24.999985Z","iopub.status.busy":"2021-12-
31T06:37:24.999695Z","iopub.status.idle":"2021-12-
31T06:37:25.024673Z","shell.execute_reply":"2021-12-
31T06:37:25.023427Z","shell.execute_reply.started":"2021-12-
31T06:37:24.999952Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"sXQpRJjufXJN","executionInfo":
{"status":"ok","timestamp":1702117998941,"user_tz":-360,"elapsed":852,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"8aa3fc9a-7cb7-4962-edd5-
99d27b71df2f"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'Do this work As Soon As
Possible'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":39}],"source":
["chat_conversion('Do this work ASAP')"]},{"cell_type":"markdown","metadata":
{"id":"8eP5rC1dfXJN"},"source":["# incorrect_text handling"]},
{"cell_type":"code","execution_count":41,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:37:38.717551Z","iopub.status.busy":"2021-12-
31T06:37:38.717223Z","iopub.status.idle":"2021-12-
31T06:37:40.330551Z","shell.execute_reply":"2021-12-
31T06:37:40.329692Z","shell.execute_reply.started":"2021-12-
31T06:37:38.717517Z"},"id":"tvF49XU6fXJO","executionInfo":
{"status":"ok","timestamp":1702118134196,"user_tz":-360,"elapsed":2692,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["from textblob import TextBlob"]},
{"cell_type":"code","execution_count":42,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:37:50.015862Z","iopub.status.busy":"2021-12-
31T06:37:50.015545Z","iopub.status.idle":"2021-12-
31T06:37:50.104071Z","shell.execute_reply":"2021-12-
31T06:37:50.103445Z","shell.execute_reply.started":"2021-12-
31T06:37:50.015829Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"ytlFIBi5fXJO","executionInfo":
{"status":"ok","timestamp":1702118160445,"user_tz":-360,"elapsed":840,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"f86d439b-dd98-4500-e091-
6645dac95ab1"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'certain
conditions during several generations are modified in the same
manner.'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":42}],"source":["incorrect_text =
'ceertain conditionas duriing seveal ggenerations aree moodified in the saame
maner.'\n","\n","textBlb = TextBlob(incorrect_text)\n","\
n","textBlb.correct().string"]},{"cell_type":"markdown","metadata":
{"id":"WITVhvMnfXJO"},"source":["# stopwords"]},
{"cell_type":"code","execution_count":43,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:38:05.959259Z","iopub.status.busy":"2021-12-
31T06:38:05.958488Z","iopub.status.idle":"2021-12-
31T06:38:05.963636Z","shell.execute_reply":"2021-12-
31T06:38:05.962600Z","shell.execute_reply.started":"2021-12-
31T06:38:05.959220Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"n-
2O4IPafXJO","executionInfo":{"status":"ok","timestamp":1702118197970,"user_tz":-
360,"elapsed":834,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"866389ed-6303-480f-9590-
8f0ad571f79c"},"outputs":[{"output_type":"stream","name":"stderr","text":
["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data]
Unzipping corpora/stopwords.zip.\n"]},{"output_type":"execute_result","data":
{"text/plain":["True"]},"metadata":{},"execution_count":43}],"source":["from
nltk.corpus import stopwords\n","import nltk\n","nltk.download('stopwords')"]},
{"cell_type":"code","execution_count":44,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:38:12.662952Z","iopub.status.busy":"2021-12-
31T06:38:12.662315Z","iopub.status.idle":"2021-12-
31T06:38:12.676818Z","shell.execute_reply":"2021-12-
31T06:38:12.675895Z","shell.execute_reply.started":"2021-12-
31T06:38:12.662905Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"5cF8S-
1QfXJP","executionInfo":{"status":"ok","timestamp":1702118201385,"user_tz":-
360,"elapsed":609,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"de4c1692-2716-4ce6-c073-
e237dd36b756"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['i',\n"," 'me',\n"," 'my',\n"," 'myself',\n"," 'we',\n"," 'our',\n"," 'ours',\
n"," 'ourselves',\n"," 'you',\n"," \"you're\",\n"," \"you've\",\n"," \"you'll\",\
n"," \"you'd\",\n"," 'your',\n"," 'yours',\n"," 'yourself',\n"," 'yourselves',\n","
'he',\n"," 'him',\n"," 'his',\n"," 'himself',\n"," 'she',\n"," \"she's\",\n","
'her',\n"," 'hers',\n"," 'herself',\n"," 'it',\n"," \"it's\",\n"," 'its',\n","
'itself',\n"," 'they',\n"," 'them',\n"," 'their',\n"," 'theirs',\n","
'themselves',\n"," 'what',\n"," 'which',\n"," 'who',\n"," 'whom',\n"," 'this',\n","
'that',\n"," \"that'll\",\n"," 'these',\n"," 'those',\n"," 'am',\n"," 'is',\n","
'are',\n"," 'was',\n"," 'were',\n"," 'be',\n"," 'been',\n"," 'being',\n"," 'have',\
n"," 'has',\n"," 'had',\n"," 'having',\n"," 'do',\n"," 'does',\n"," 'did',\n","
'doing',\n"," 'a',\n"," 'an',\n"," 'the',\n"," 'and',\n"," 'but',\n"," 'if',\n","
'or',\n"," 'because',\n"," 'as',\n"," 'until',\n"," 'while',\n"," 'of',\n"," 'at',\
n"," 'by',\n"," 'for',\n"," 'with',\n"," 'about',\n"," 'against',\n"," 'between',\
n"," 'into',\n"," 'through',\n"," 'during',\n"," 'before',\n"," 'after',\n","
'above',\n"," 'below',\n"," 'to',\n"," 'from',\n"," 'up',\n"," 'down',\n"," 'in',\
n"," 'out',\n"," 'on',\n"," 'off',\n"," 'over',\n"," 'under',\n"," 'again',\n","
'further',\n"," 'then',\n"," 'once',\n"," 'here',\n"," 'there',\n"," 'when',\n","
'where',\n"," 'why',\n"," 'how',\n"," 'all',\n"," 'any',\n"," 'both',\n"," 'each',\
n"," 'few',\n"," 'more',\n"," 'most',\n"," 'other',\n"," 'some',\n"," 'such',\n","
'no',\n"," 'nor',\n"," 'not',\n"," 'only',\n"," 'own',\n"," 'same',\n"," 'so',\n","
'than',\n"," 'too',\n"," 'very',\n"," 's',\n"," 't',\n"," 'can',\n"," 'will',\n","
'just',\n"," 'don',\n"," \"don't\",\n"," 'should',\n"," \"should've\",\n"," 'now',\
n"," 'd',\n"," 'll',\n"," 'm',\n"," 'o',\n"," 're',\n"," 've',\n"," 'y',\n","
'ain',\n"," 'aren',\n"," \"aren't\",\n"," 'couldn',\n"," \"couldn't\",\n","
'didn',\n"," \"didn't\",\n"," 'doesn',\n"," \"doesn't\",\n"," 'hadn',\
n"," \"hadn't\",\n"," 'hasn',\n"," \"hasn't\",\n"," 'haven',\n"," \"haven't\",\n","
'isn',\n"," \"isn't\",\n"," 'ma',\n"," 'mightn',\n"," \"mightn't\",\n"," 'mustn',\
n"," \"mustn't\",\n"," 'needn',\n"," \"needn't\",\n"," 'shan',\n"," \"shan't\",\
n"," 'shouldn',\n"," \"shouldn't\",\n"," 'wasn',\n"," \"wasn't\",\n"," 'weren',\
n"," \"weren't\",\n"," 'won',\n"," \"won't\",\n"," 'wouldn',\
n"," \"wouldn't\"]"]},"metadata":{},"execution_count":44}],"source":
["stopwords.words('english')"]},{"cell_type":"code","source":
["len(stopwords.words('english'))"],"metadata":{"colab":{"base_uri":"https://
localhost:8080/"},"id":"h_9njCnCqeW7","executionInfo":
{"status":"ok","timestamp":1702118438447,"user_tz":-360,"elapsed":627,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"73ecb21f-dcf4-47fc-e5e1-
a64711a3814b"},"execution_count":45,"outputs":
[{"output_type":"execute_result","data":{"text/plain":["179"]},"metadata":
{},"execution_count":45}]},{"cell_type":"code","execution_count":46,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:38:56.165757Z","iopub.status.busy":"2021-12-
31T06:38:56.165121Z","iopub.status.idle":"2021-12-
31T06:38:56.171381Z","shell.execute_reply":"2021-12-
31T06:38:56.170444Z","shell.execute_reply.started":"2021-12-
31T06:38:56.165715Z"},"id":"C6FQ_crxfXJP","executionInfo":
{"status":"ok","timestamp":1702118462159,"user_tz":-360,"elapsed":594,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["def remove_stopwords(text):\n",
"    \"\"\"Return ``text`` with NLTK English stopwords removed.\n",
"\n",
"    ``text`` is split on whitespace and the words that are kept are re-joined\n",
"    with single spaces. Matching is exact: it is case-sensitive and a word\n",
"    with punctuation attached is compared as written.\n",
"    \"\"\"\n",
"    # Fetch the list once and turn it into a set; looking each word up in\n",
"    # stopwords.words('english') directly repeats the call and a list scan\n",
"    # for every word.\n",
"    stop_set = set(stopwords.words('english'))\n",
"    # Stopwords are dropped outright rather than replaced by '' so the\n",
"    # result has no runs of consecutive spaces.\n",
"    return \" \".join(word for word in text.split() if word not in stop_set)"]},
{"cell_type":"code","execution_count":47,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:39:01.303753Z","iopub.status.busy":"2021-12-
31T06:39:01.302916Z","iopub.status.idle":"2021-12-
31T06:39:01.315515Z","shell.execute_reply":"2021-12-
31T06:39:01.314610Z","shell.execute_reply.started":"2021-12-
31T06:39:01.303713Z"},"colab":{"base_uri":"https://
localhost:8080/","height":52},"id":"6ToJ1uqBfXJP","executionInfo":
{"status":"ok","timestamp":1702118468950,"user_tz":-360,"elapsed":875,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"1c6f5c9c-43bc-4ff9-d96c-
48fccd7ad746"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'probably all-time favorite movie, story selflessness, sacrifice dedication
noble cause, preachy boring. never gets old, despite seen 15
times'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":47}],"source":
["remove_stopwords('probably my all-time favorite movie, a story of selflessness,
sacrifice and dedication to a noble cause, but it\\'s not preachy or boring. it
just never gets old, despite my having seen it some 15 or more times')"]},
{"cell_type":"code","execution_count":48,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:39:05.677615Z","iopub.status.busy":"2021-12-
31T06:39:05.677024Z","iopub.status.idle":"2021-12-
31T06:39:05.688124Z","shell.execute_reply":"2021-12-
31T06:39:05.687181Z","shell.execute_reply.started":"2021-12-
31T06:39:05.677575Z"},"colab":{"base_uri":"https://
localhost:8080/","height":206},"id":"Sv9S5wuGfXJQ","executionInfo":
{"status":"ok","timestamp":1702118481663,"user_tz":-360,"elapsed":1104,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"79adfc6c-ae9f-42f4-a2b5-
ffa19f8aeeb8"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["
review sentiment\n","0 one of the other reviewers has mentioned that ...
positive\n","1 a wonderful little production. the filming tec... positive\n","2
i thought this was a wonderful way to spend ti... positive\n","3 basically
there's a family where a little boy ... negative\n","4 petter mattei's \"love in
the time of money\" is... positive"],"text/html":["\n"," <div id=\"df-30935505-
f8cf-40de-91ca-187cf4e44e0e\" class=\"colab-df-container\">\n"," <div>\
n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n","
vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n","
vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-
align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\
n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n","
<th>review</th>\n"," <th>sentiment</th>\n"," </tr>\n"," </thead>\n","
<tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>one of the other reviewers
has mentioned that ...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\
n"," <th>1</th>\n"," <td>a wonderful little production. the filming
tec...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\n","
<th>2</th>\n"," <td>i thought this was a wonderful way to spend ti...</td>\
n"," <td>positive</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n","
<td>basically there's a family where a little boy ...</td>\n","
<td>negative</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n","
<td>petter mattei's \"love in the time of money\" is...</td>\n","
<td>positive</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div
class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n","
<button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-30935505-
f8cf-40de-91ca-187cf4e44e0e')\"\n"," title=\"Convert this dataframe to
an interactive table.\"\n","
style=\"display:none;\">\n","\n"," <svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\
n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-
160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-
160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\
n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n","
display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n","
background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\
n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n","
height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\
n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n","
box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67,
0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n","
margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n","
background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n","
[theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n","
box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px
1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\
n","\n"," <script>\n"," const buttonEl =\n","
document.querySelector('#df-30935505-f8cf-40de-91ca-187cf4e44e0e button.colab-df-
convert');\n"," buttonEl.style.display =\n","
google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function
convertToInteractive(key) {\n"," const element =
document.querySelector('#df-30935505-f8cf-40de-91ca-187cf4e44e0e');\n","
const dataTable =\n"," await
google.colab.kernel.invokeFunction('convertToInteractive',\n","
[key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml
= 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n"," + ' to learn more about interactive tables.';\n","
element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n","
await google.colab.output.renderOutput(dataTable, element);\n"," const
docLink = document.createElement('div');\n"," docLink.innerHTML =
docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n","
</script>\n"," </div>\n","\n","\n","<div id=\"df-8a4d5c69-09b4-47cb-a1b3-
5d80e07d8ad6\">\n"," <button class=\"colab-df-quickchart\"
onclick=\"quickchart('df-8a4d5c69-09b4-47cb-a1b3-5d80e07d8ad6')\"\n","
title=\"Suggest charts\"\n"," style=\"display:none;\">\n","\n","<svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","
width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0
1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-
2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\
n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color:
#1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color:
#174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color:
#DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color:
#3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n","
--hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --
disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n","
background-color: var(--bg-color);\n"," border: none;\n"," border-radius:
50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-
color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\
n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-
color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60,
64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\
n"," .colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-
complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n","
fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\
n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n","
border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n","
animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin
{\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color:
var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n","
20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-
color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n","
border-color: transparent;\n"," border-left-color: var(--fill-color);\n","
border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-
color);\n"," }\n"," 40% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-
color);\n"," }\n"," 60% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-
color: transparent;\n"," border-right-color: var(--fill-color);\n","
border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-
color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\
n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\
n"," const quickchartButtonEl =\n"," document.querySelector('#' + key +
' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n","
try {\n"," const charts = await google.colab.kernel.invokeFunction(\n","
'suggestCharts', [key], {});\n"," } catch (error) {\n","
console.error('Error during call to suggestCharts:', error);\n"," }\n","
quickchartButtonEl.classList.remove('colab-df-spinner');\n","
quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n","
(() => {\n"," let quickchartButtonEl =\n","
document.querySelector('#df-8a4d5c69-09b4-47cb-a1b3-5d80e07d8ad6 button');\n","
quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ?
'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n","
</div>\n"]},"metadata":{},"execution_count":48}],"source":["df.head()"]},
{"cell_type":"code","execution_count":49,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:39:09.921822Z","iopub.status.busy":"2021-12-
31T06:39:09.921540Z","iopub.status.idle":"2021-12-
31T06:42:18.536775Z","shell.execute_reply":"2021-12-
31T06:42:18.535137Z","shell.execute_reply.started":"2021-12-
31T06:39:09.921791Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"7TT5T2iifXJR","executionInfo":
{"status":"ok","timestamp":1702118488630,"user_tz":-360,"elapsed":4439,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"14c990ea-1959-4f78-e9f6-
a9b55aa196b6"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["0
one reviewers mentioned watching 1 oz e...\n","1 wonderful little
production. filming techniq...\n","2 thought wonderful way spend time
hot s...\n","3 basically there's family little boy (jake) ...\n","4
petter mattei's \"love time money\" visuall...\n","
... \n","95 daniel day-lewis versatile actor alive.
eng...\n","96 guess would originally going least two ...\n","97 well,
like watch bad horror b-movies, cause ...\n","98 worst movie ever seen,
well as, worst ...\n","99 mario fan long remember, fond memo...\
n","Name: review, Length: 100, dtype: object"]},"metadata":
{},"execution_count":49}],"source":["df['review'].apply(remove_stopwords)"]},
{"cell_type":"markdown","metadata":{"id":"U1BpIUyGfXJS"},"source":["# Handling emojis"]},
{"cell_type":"code","execution_count":50,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:42:51.745358Z","iopub.status.busy":"2021-12-
31T06:42:51.744505Z","iopub.status.idle":"2021-12-
31T06:42:51.750288Z","shell.execute_reply":"2021-12-
31T06:42:51.749730Z","shell.execute_reply.started":"2021-12-
31T06:42:51.745310Z"},"id":"G1Flwi-jfXJS","executionInfo":
{"status":"ok","timestamp":1702118593135,"user_tz":-360,"elapsed":609,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["import re\n",
"\n",
"# Compiled once for the whole notebook. Every range is a block that actually holds\n",
"# emoji/pictographs; a catch-all such as U+24C2..U+1F251 would also delete CJK,\n",
"# Hangul and other ordinary non-Latin text.\n",
"_EMOJI_PATTERN = re.compile(\n",
"    \"[\"\n",
"    \"\\U0001F600-\\U0001F64F\"  # emoticons\n",
"    \"\\U0001F300-\\U0001F5FF\"  # symbols & pictographs\n",
"    \"\\U0001F680-\\U0001F6FF\"  # transport & map symbols\n",
"    \"\\U0001F1E0-\\U0001F1FF\"  # flags (regional indicators)\n",
"    \"\\U0001F900-\\U0001F9FF\"  # supplemental symbols & pictographs\n",
"    \"\\U00002600-\\U000026FF\"  # miscellaneous symbols\n",
"    \"\\U00002702-\\U000027B0\"  # dingbats\n",
"    \"\\U000024C2\"  # circled M\n",
"    \"\\U0001F251\"  # circled ideograph accept\n",
"    \"\\U0000FE0F\"  # variation selector-16 left behind by emoji sequences\n",
"    \"]+\",\n",
"    flags=re.UNICODE,\n",
")\n",
"\n",
"\n",
"def remove_emoji(text):\n",
"    \"\"\"Return `text` with every emoji character removed.\n",
"\n",
"    Only the code points listed in `_EMOJI_PATTERN` are stripped; all other text,\n",
"    including non-Latin scripts, is returned unchanged. Whitespace around a removed\n",
"    emoji is kept as-is.\n",
"    \"\"\"\n",
"    return _EMOJI_PATTERN.sub('', text)"]},
{"cell_type":"code","execution_count":51,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:42:54.927028Z","iopub.status.busy":"2021-12-
31T06:42:54.926479Z","iopub.status.idle":"2021-12-
31T06:42:54.937617Z","shell.execute_reply":"2021-12-
31T06:42:54.937038Z","shell.execute_reply.started":"2021-12-
31T06:42:54.926979Z"},"co
lab":{"base_uri":"https://localhost:8080/","height":35},"id":"ytld3e-
afXJT","executionInfo":{"status":"ok","timestamp":1702118604549,"user_tz":-
360,"elapsed":2045,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"845cf3fb-c7b9-4910-8dba-
c391a52cbb80"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'Loved the movie. It was
'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":51}],"source":
["remove_emoji(\"Loved the movie. It was 😘😘\")"]},
{"cell_type":"code","execution_count":52,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:42:57.500459Z","iopub.status.busy":"2021-12-
31T06:42:57.499747Z","iopub.status.idle":"2021-12-
31T06:42:57.505979Z","shell.execute_reply":"2021-12-
31T06:42:57.505427Z","shell.execute_reply.started":"2021-12-
31T06:42:57.500400Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"tWwFcSKYfXJT","executionInfo":
{"status":"ok","timestamp":1702118609581,"user_tz":-360,"elapsed":29,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"912f211f-7a8a-4f51-e6e2-
3d86d8f88b15"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'Lmao '"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":52}],"source":
["remove_emoji(\"Lmao 😂😂\")"]},{"cell_type":"code","source":["%pip install emoji==2.9.0"],
"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2kaMJEYFnG
4s","executionInfo":{"status":"ok","timestamp":1702118678831,"user_tz":-
360,"elapsed":9961,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"559fa0e6-f7a4-4d0a-8ba7-
24603e4014e6"},"execution_count":53,"outputs":
[{"output_type":"stream","name":"stdout","text":["Collecting emoji\n","
Downloading emoji-2.9.0-py2.py3-none-any.whl (397 kB)\n","\u001b[2K \
u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m397.5/397.5
kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?
25hInstalling collected packages: emoji\n","Successfully installed emoji-2.9.0\
n"]}]},{"cell_type":"code","execution_count":54,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:04.191966Z","iopub.status.busy":"2021-12-
31T06:43:04.191476Z","iopub.status.idle":"2021-12-
31T06:43:04.480988Z","shell.execute_reply":"2021-12-
31T06:43:04.480125Z","shell.execute_reply.started":"2021-12-
31T06:43:04.191935Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"N7KbcmehfXJU","executionInfo":
{"status":"ok","timestamp":1702118690509,"user_tz":-360,"elapsed":849,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"d621aba2-2706-4cf1-a126-
a70d06126506"},"outputs":[{"output_type":"stream","name":"stdout","text":["Python
is :fire:\n"]}],"source":["import emoji\n","print(emoji.demojize('Python is
🔥'))"]},{"cell_type":"code","execution_count":55,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:07.217770Z","iopub.status.busy":"2021-12-
31T06:43:07.217301Z","iopub.status.idle":"2021-12-
31T06:43:07.222526Z","shell.execute_reply":"2021-12-
31T06:43:07.221705Z","shell.execute_reply.started":"2021-12-
31T06:43:07.217723Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"9UuuiuetfXJU","executionInfo":
{"status":"ok","timestamp":1702118701342,"user_tz":-360,"elapsed":921,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"cbe7769c-7d2a-4317-b5f8-
ec0383bbe61f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Loved
the movie. It was :face_blowing_a_kiss:\n"]}],"source":
["print(emoji.demojize('Loved the movie. It was 😘'))"]},
{"cell_type":"markdown","metadata":{"id":"L68UBnjvfXJV"},"source":["#
Tokenization"]},{"cell_type":"markdown","metadata":{"id":"s4DRrhBifXJV"},"source":
["### 1. Using the split function"]},
{"cell_type":"code","execution_count":56,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:15.810492Z","iopub.status.busy":"2021-12-
31T06:43:15.810189Z","iopub.status.idle":"2021-12-
31T06:43:15.816173Z","shell.execute_reply":"2021-12-
31T06:43:15.815384Z","shell.execute_reply.started":"2021-12-
31T06:43:15.810462Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"ixnRRzv1fXJV","executionInfo":
{"status":"ok","timestamp":1702118799113,"user_tz":-360,"elapsed":637,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"2a4f405c-64ed-4688-ba8a-
f88a145eeef5"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['I', 'am', 'going', 'to', 'delhi']"]},"metadata":
{},"execution_count":56}],"source":["# word tokenization\n","sent1 = 'I am going to
delhi'\n","sent1.split()"]},{"cell_type":"code","execution_count":57,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:43:16.672825Z","iopub.status.busy":"2021-12-
31T06:43:16.672535Z","iopub.status.idle":"2021-12-
31T06:43:16.679365Z","shell.execute_reply":"2021-12-
31T06:43:16.678423Z","shell.execute_reply.started":"2021-12-
31T06:43:16.672793Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"Uq8GvD4lfXJW","executionInfo":
{"status":"ok","timestamp":1702118821622,"user_tz":-360,"elapsed":874,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"655eea9a-7431-4a25-997a-
3a4b0cf52227"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['I am going to delhi',\n"," ' I will stay there for 3 days',\n"," \" Let's hope
the trip to be great\"]"]},"metadata":{},"execution_count":57}],"source":["#
sentence tokenization\n","sent2 = 'I am going to delhi. I will stay there for 3
days. Let\\'s hope the trip to be great'\n","sent2.split('.')"]},
{"cell_type":"code","execution_count":58,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:19.371045Z","iopub.status.busy":"2021-12-
31T06:43:19.370499Z","iopub.status.idle":"2021-12-
31T06:43:19.376805Z","shell.execute_reply":"2021-12-
31T06:43:19.376060Z","shell.execute_reply.started":"2021-12-
31T06:43:19.371000Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"NuUmlRlqfXJW","executionInfo":
{"status":"ok","timestamp":1702118917299,"user_tz":-360,"elapsed":591,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"c8858d81-b372-4f3b-86e7-
ea3298f3e425"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['I', 'am', 'going', 'to', 'delhi!']"]},"metadata":
{},"execution_count":58}],"source":["# Problems with split function\n","sent3 = 'I
am going to delhi!'\n","sent3.split()"]},
{"cell_type":"code","execution_count":59,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:22.987186Z","iopub.status.busy":"2021-12-
31T06:43:22.986910Z","iopub.status.idle":"2021-12-
31T06:43:22.993658Z","shell.execute_reply":"2021-12-
31T06:43:22.992820Z","shell.execute_reply.started":"2021-12-
31T06:43:22.987156Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"QMTgzvsCfXJX","executionInfo":
{"status":"ok","timestamp":1702118929249,"user_tz":-360,"elapsed":1256,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"8fd5f928-a042-433d-ddba-
c03d1e6fe0e4"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['Where do think I should go? I have 3 day holiday']"]},"metadata":
{},"execution_count":59}],"source":["sent4 = 'Where do think I should go? I have 3
day holiday'\n","sent4.split('.')"]},{"cell_type":"markdown","metadata":
{"id":"YyvxOXehfXJX"},"source":["### 2. Regular Expression"]},
{"cell_type":"code","execution_count":60,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:25.066222Z","iopub.status.busy":"2021-12-
31T06:43:25.065942Z","iopub.status.idle":"2021-12-
31T06:43:25.072526Z","shell.execute_reply":"2021-12-
31T06:43:25.071800Z","shell.execute_reply.started":"2021-12-
31T06:43:25.066190Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"FpUVu6qffXJY","executionInfo":
{"status":"ok","timestamp":1702118940954,"user_tz":-360,"elapsed":1057,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"7246b2a6-60b3-4bf8-fd04-
60c2d7015497"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['I', 'am', 'going', 'to', 'delhi']"]},"metadata":
{},"execution_count":60}],"source":["import re\n",
"sent3 = 'I am going to delhi!'\n",
"# Raw string so that \\w reaches the regex engine instead of being read as a str escape.\n",
"tokens = re.findall(r\"[\\w']+\", sent3)\n",
"tokens"]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:36.071658Z","iopub.status.busy":"2021-12-
31T06:43:36.071360Z","iopub.status.idle":"2021-12-
31T06:43:36.079075Z","shell.execute_reply":"2021-12-
31T06:43:36.078094Z","shell.execute_reply.started":"2021-12-
31T06:43:36.071624Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"xRQv_HBafXJZ","executionInfo":
{"status":"ok","timestamp":1694930027431,"user_tz":-360,"elapsed":723,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"21b5446e-845d-4219-8e5a-
6ed69d8e421f"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['Lorem Ipsum is simply dummy text of the printing and typesetting industry',\
n"," \"\\nLorem Ipsum has been the industry's standard dummy text ever since the
1500s, \\nwhen an unknown printer took a galley of type and scrambled it to make a
type specimen book.\"]"]},"metadata":{},"execution_count":67}],"source":["\n",
"text = \"\"\"Lorem Ipsum is simply dummy text of the printing and typesetting industry?\n",
"Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\n",
"when an unknown printer took a galley of type and scrambled it to make a type specimen book.\"\"\"\n",
"# Split after ., ! or ? followed by any whitespace: a literal space alone misses\n",
"# sentences that end at a line break, as the first one does here.\n",
"sentences = re.compile(r'[.!?]\\s+').split(text)\n",
"sentences"]},{"cell_type":"markdown","metadata":{"id":"E0WfA5rxfXJZ"},"source":
["### 3. NLTK"]},{"cell_type":"code","execution_count":61,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:43:43.710755Z","iopub.status.busy":"2021-12-
31T06:43:43.710469Z","iopub.status.idle":"202
1-12-31T06:43:43.715818Z","shell.execute_reply":"2021-12-
31T06:43:43.715109Z","shell.execute_reply.started":"2021-12-
31T06:43:43.710728Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"7Xm_QZ8BfXJZ","executionInfo":
{"status":"ok","timestamp":1702118958212,"user_tz":-360,"elapsed":1011,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"71b6e3fd-a1eb-4a12-c658-
fc278fb4865f"},"outputs":[{"output_type":"stream","name":"stderr","text":
["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data]
Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":
{"text/plain":["True"]},"metadata":{},"execution_count":61}],"source":["from
nltk.tokenize import word_tokenize,sent_tokenize\n","import nltk\
n","nltk.download('punkt')"]},{"cell_type":"code","execution_count":62,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
31T06:43:44.073430Z","iopub.status.busy":"2021-12-
31T06:43:44.072769Z","iopub.status.idle":"2021-12-
31T06:43:44.093236Z","shell.execute_reply":"2021-12-
31T06:43:44.092271Z","shell.execute_reply.started":"2021-12-
31T06:43:44.073393Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"-
abmrHeNfXJa","executionInfo":{"status":"ok","timestamp":1702118969041,"user_tz":-
360,"elapsed":847,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"3d1eb594-0ceb-44db-f4d6-
05b3f4f136aa"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['I', 'am', 'going', 'to', 'visit', 'delhi', '!']"]},"metadata":
{},"execution_count":62}],"source":["sent1 = 'I am going to visit delhi!'\
n","word_tokenize(sent1)"]},{"cell_type":"code","execution_count":63,"metadata":
{"execution":{"iopub.execute_input":"2021-12-
10T06:03:00.917544Z","iopub.status.busy":"2021-12-
10T06:03:00.915226Z","iopub.status.idle":"2021-12-
10T06:03:00.924761Z","shell.execute_reply":"2021-12-
10T06:03:00.923887Z","shell.execute_reply.started":"2021-12-
10T06:03:00.917502Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"g9xpqy5rfXJa","executionInfo":
{"status":"ok","timestamp":1702118973004,"user_tz":-360,"elapsed":611,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"564317bf-d40a-4b18-a15c-
e2c5adc995cf"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',\
n"," \"Lorem Ipsum has been the industry's standard dummy text ever since the
1500s,\\nwhen an unknown printer took a galley of type and scrambled it to make a
type specimen book.\"]"]},"metadata":{},"execution_count":63}],"source":["text
= \"\"\"Lorem Ipsum is simply dummy text of the printing and typesetting industry?\
n","Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\
n","when an unknown printer took a galley of type and scrambled it to make a type
specimen book.\"\"\"\n","\n","sent_tokenize(text)"]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:44:20.220975Z","iopub.status.busy":"2021-12-
31T06:44:20.220662Z","iopub.status.idle":"2021-12-
31T06:44:20.227882Z","shell.execute_reply":"2021-12-
31T06:44:20.227311Z","shell.execute_reply.started":"2021-12-
31T06:44:20.220945Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"WPcDxHj1fXJa","executionInfo":
{"status":"ok","timestamp":1694930117285,"user_tz":-360,"elapsed":719,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"efaf0458-40e6-4d63-d490-
a193122e353b"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['I', 'have', 'a', 'Ph.D', 'in', 'A.I']"]},"metadata":
{},"execution_count":71}],"source":["sent5 = 'I have a Ph.D in A.I'\n","sent6
= \"We're here to help! mail us at nks@gmail.com\"\n","sent7 = 'A 5km ride cost
$10.50'\n","\n","word_tokenize(sent5)"]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:44:25.644168Z","iopub.status.busy":"2021-12-
31T06:44:25.643643Z","iopub.status.idle":"2021-12-
31T06:44:25.651199Z","shell.execute_reply":"2021-12-
31T06:44:25.650604Z","shell.execute_reply.started":"2021-12-
31T06:44:25.644113Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"7DPWnjOJfXJa","executionInfo":
{"status":"ok","timestamp":1694930122349,"user_tz":-360,"elapsed":968,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"781cdccf-a473-4b96-fae4-
a705d5a524da"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['We',\n"," \"'re\",\n"," 'here',\n"," 'to',\n"," 'help',\n"," '!',\n"," 'mail',\
n"," 'us',\n"," 'at',\n"," 'nks',\n"," '@',\n"," 'gmail.com']"]},"metadata":
{},"execution_count":72}],"source":["word_tokenize(sent6)"]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:44:28.249968Z","iopub.status.busy":"2021-12-
31T06:44:28.249507Z","iopub.status.idle":"2021-12-
31T06:44:28.255852Z","shell.execute_reply":"2021-12-
31T06:44:28.254914Z","shell.execute_reply.started":"2021-12-
31T06:44:28.249918Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"4fP4Af93fXJb","executionInfo":
{"status":"ok","timestamp":1694930127142,"user_tz":-360,"elapsed":1049,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"9dfcbcad-1444-47ee-81c9-
b1085d87b7e5"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["['A', '5km', 'ride', 'cost', '$', '10.50']"]},"metadata":
{},"execution_count":73}],"source":["word_tokenize(sent7)"]},
{"cell_type":"markdown","metadata":{"id":"zGCXaozqfXJb"},"source":["### 4. spaCy (recommended)"]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:44:30.428238Z","iopub.status.busy":"2021-12-
31T06:44:30.427753Z","iopub.status.idle":"2021-12-
31T06:44:40.692846Z","shell.execute_reply":"2021-12-
31T06:44:40.692211Z","shell.execute_reply.started":"2021-12-
31T06:44:30.428189Z"},"id":"SwVDcE4mfXJb"},"outputs":[],"source":["import spacy\
n","nlp = spacy.load('en_core_web_sm')"]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:44:40.694652Z","iopub.status.busy":"2021-12-
31T06:44:40.694267Z","iopub.status.idle":"2021-12-
31T06:44:40.739047Z","shell.execute_reply":"2021-12-
31T06:44:40.738463Z","shell.execute_reply.started":"2021-12-
31T06:44:40.694605Z"},"id":"hT14PkK_fXJb"},"outputs":[],"source":["doc1 =
nlp(sent5)\n","doc2 = nlp(sent6)\n","doc3 = nlp(sent7)\n","doc4 = nlp(sent1)"]},
{"cell_type":"code","source":["doc4 = nlp(sent1)\n","doc4"],"metadata":{"colab":
{"base_uri":"https://localhost:8080/"},"id":"IqiMgoh02hVU","executionInfo":
{"status":"ok","timestamp":1694930168336,"user_tz":-360,"elapsed":1106,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"af862652-7230-445e-82da-
1442f75738f6"},"execution_count":null,"outputs":
[{"output_type":"execute_result","data":{"text/plain":["I am going to visit
delhi!"]},"metadata":{},"execution_count":76}]},
{"cell_type":"code","execution_count":null,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:44:55.784086Z","iopub.status.busy":"2021-12-
31T06:44:55.783589Z","iopub.status.idle":"2021-12-
31T06:44:55.790345Z","shell.execute_reply":"2021-12-
31T06:44:55.789180Z","shell.execute_reply.started":"2021-12-
31T06:44:55.784037Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"99vrOvtQfXJc","executionInfo":
{"status":"ok","timestamp":1694930172591,"user_tz":-360,"elapsed":964,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"5bb36b67-6783-4847-cf62-
93075bf30918"},"outputs":[{"output_type":"stream","name":"stdout","text":["I\
n","am\n","going\n","to\n","visit\n","delhi\n","!\n"]}],"source":["for token in
doc4:\n"," print(token)"]},{"cell_type":"code","source":
["df.head()"],"metadata":{"colab":{"base_uri":"https://
localhost:8080/","height":206},"id":"3uhpdsyI2pac","executionInfo":
{"status":"ok","timestamp":1694930176672,"user_tz":-360,"elapsed":768,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"dd74ed2f-f487-4b93-a7ca-
3882198f6e0e"},"execution_count":null,"outputs":
[{"output_type":"execute_result","data":{"text/plain":["
review sentiment\n","0 one of the other reviewers has mentioned that ...
positive\n","1 a wonderful little production. the filming tec... positive\n","2
i thought this was a wonderful way to spend ti... positive\n","3 basically
there's a family where a little boy ... negative\n","4 petter mattei's \"love in
the time of money\" is... positive"],"text/html":["\n"," <div id=\"df-d2a3960e-
6709-4c13-8048-9e3410e4ed3d\" class=\"colab-df-container\">\n"," <div>\
n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n","
vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n","
vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-
align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\
n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n","
<th>review</th>\n"," <th>sentiment</th>\n"," </tr>\n"," </thead>\n","
<tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>one of the other reviewers
has mentioned that ...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\
n"," <th>1</th>\n"," <td>a wonderful little production. the filming
tec...</td>\n"," <td>positive</td>\n"," </tr>\n"," <tr>\n","
<th>2</th>\n"," <td>i thought this was a wonderful way to spend ti...</td>\
n"," <td>positive</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n","
<td>basically there's a family where a little boy ...</td>\n","
<td>negative</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n","
<td>petter mattei's \"love in the time of money\" is...</td>\n","
<td>positive</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div
class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n","
<button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d2a3960e-
6709-4c13-8048-9e3410e4ed3d')\"\n"," title=\"Convert this dataframe to
an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\
n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-
160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-
160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\
n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n","
display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n","
background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\
n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n","
height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\
n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n","
box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67,
0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n","
margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n","
background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n","
[theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n","
box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px
1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\
n","\n"," <script>\n"," const buttonEl =\n","
document.querySelector('#df-d2a3960e-6709-4c13-8048-9e3410e4ed3d button.colab-df-
convert');\n"," buttonEl.style.display =\n","
google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function
convertToInteractive(key) {\n"," const element =
document.querySelector('#df-d2a3960e-6709-4c13-8048-9e3410e4ed3d');\n","
const dataTable =\n"," await
google.colab.kernel.invokeFunction('convertToInteractive',\n","
[key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml
= 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\"
href=https://colab.research.google.com/notebooks/data_table.ipynb>data table
notebook</a>'\n"," + ' to learn more about interactive tables.';\n","
element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n","
await google.colab.output.renderOutput(dataTable, element);\n"," const
docLink = document.createElement('div');\n"," docLink.innerHTML =
docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n","
</script>\n"," </div>\n","\n","\n","<div id=\"df-c8be058b-b4e7-4936-ad31-
679859c35d7d\">\n"," <button class=\"colab-df-quickchart\"
onclick=\"quickchart('df-c8be058b-b4e7-4936-ad31-679859c35d7d')\"\n","
title=\"Suggest charts.\"\n"," style=\"display:none;\">\n","\n","<svg
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","
width=\"24px\">\n"," <g>\n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0
1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-
2v-4h2v4z\"/>\n"," </g>\n","</svg>\n"," </button>\n","\n","<style>\
n"," .colab-df-quickchart {\n"," --bg-color: #E8F0FE;\n"," --fill-color:
#1967D2;\n"," --hover-bg-color: #E2EBFA;\n"," --hover-fill-color:
#174EA6;\n"," --disabled-fill-color: #AAA;\n"," --disabled-bg-color:
#DDD;\n"," }\n","\n"," [theme=dark] .colab-df-quickchart {\n"," --bg-color:
#3B4455;\n"," --fill-color: #D2E3FC;\n"," --hover-bg-color: #434B5C;\n","
--hover-fill-color: #FFFFFF;\n"," --disabled-bg-color: #3B4455;\n"," --
disabled-fill-color: #666;\n"," }\n","\n"," .colab-df-quickchart {\n","
background-color: var(--bg-color);\n"," border: none;\n"," border-radius:
50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: var(--fill-
color);\n"," height: 32px;\n"," padding: 0;\n"," width: 32px;\n"," }\
n","\n"," .colab-df-quickchart:hover {\n"," background-color: var(--hover-bg-
color);\n"," box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60,
64, 67, 0.15);\n"," fill: var(--button-hover-fill-color);\n"," }\n","\
n"," .colab-df-quickchart-complete:disabled,\n"," .colab-df-quickchart-
complete:disabled:hover {\n"," background-color: var(--disabled-bg-color);\n","
fill: var(--disabled-fill-color);\n"," box-shadow: none;\n"," }\n","\
n"," .colab-df-spinner {\n"," border: 2px solid var(--fill-color);\n","
border-color: transparent;\n"," border-bottom-color: var(--fill-color);\n","
animation:\n"," spin 1s steps(1) infinite;\n"," }\n","\n"," @keyframes spin
{\n"," 0% {\n"," border-color: transparent;\n"," border-bottom-color:
var(--fill-color);\n"," border-left-color: var(--fill-color);\n"," }\n","
20% {\n"," border-color: transparent;\n"," border-left-color: var(--fill-
color);\n"," border-top-color: var(--fill-color);\n"," }\n"," 30% {\n","
border-color: transparent;\n"," border-left-color: var(--fill-color);\n","
border-top-color: var(--fill-color);\n"," border-right-color: var(--fill-
color);\n"," }\n"," 40% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," border-top-color: var(--fill-
color);\n"," }\n"," 60% {\n"," border-color: transparent;\n","
border-right-color: var(--fill-color);\n"," }\n"," 80% {\n"," border-
color: transparent;\n"," border-right-color: var(--fill-color);\n","
border-bottom-color: var(--fill-color);\n"," }\n"," 90% {\n"," border-
color: transparent;\n"," border-bottom-color: var(--fill-color);\n"," }\
n"," }\n","</style>\n","\n"," <script>\n"," async function quickchart(key) {\
n"," const quickchartButtonEl =\n"," document.querySelector('#' + key +
' button');\n"," quickchartButtonEl.disabled = true; // To prevent multiple
clicks.\n"," quickchartButtonEl.classList.add('colab-df-spinner');\n","
try {\n"," const charts = await google.colab.kernel.invokeFunction(\n","
'suggestCharts', [key], {});\n"," } catch (error) {\n","
console.error('Error during call to suggestCharts:', error);\n"," }\n","
quickchartButtonEl.classList.remove('colab-df-spinner');\n","
quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n"," }\n","
(() => {\n"," let quickchartButtonEl =\n","
document.querySelector('#df-c8be058b-b4e7-4936-ad31-679859c35d7d button');\n","
quickchartButtonEl.style.display =\n"," google.colab.kernel.accessAllowed ?
'block' : 'none';\n"," })();\n"," </script>\n","</div>\n"," </div>\n","
</div>\n"]},"metadata":{},"execution_count":78}]},
{"cell_type":"markdown","metadata":{"id":"3ZO2iHAUfXJc"},"source":["# Stemmer"]},
{"cell_type":"code","execution_count":64,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:45:24.636271Z","iopub.status.busy":"2021-12-
31T06:45:24.635982Z","iopub.status.idle":"2021-12-
31T06:45:24.640222Z","shell.execute_reply":"2021-12-
31T06:45:24.639613Z","shell.execute_reply.started":"2021-12-
31T06:45:24.636238Z"},"id":"b9QyBoF7fXJd","executionInfo":
{"status":"ok","timestamp":1702119360869,"user_tz":-360,"elapsed":881,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["from nltk.stem.porter import PorterStemmer"]},
{"cell_type":"code","execution_count":65,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:46:07.014808Z","iopub.status.busy":"2021-12-
31T06:46:07.014418Z","iopub.status.idle":"2021-12-
31T06:46:07.021274Z","shell.execute_reply":"2021-12-
31T06:46:07.020180Z","shell.execute_reply.started":"2021-12-
31T06:46:07.014768Z"},"id":"tvzg9ij1fXJd","executionInfo":
{"status":"ok","timestamp":1702119367109,"user_tz":-360,"elapsed":605,"user":
{"displayName":"colab0 ineuron","userId":"16851312232179065356"}}},"outputs":
[],"source":["ps = PorterStemmer()\n",
"\n",
"\n",
"def stem_words(text):\n",
"    \"\"\"Return `text` with every whitespace-separated token replaced by its Porter stem.\n",
"\n",
"    Tokens are re-joined with single spaces, so runs of whitespace in the\n",
"    input collapse to one space in the result.\n",
"    \"\"\"\n",
"    stems = (ps.stem(token) for token in text.split())\n",
"    return \" \".join(stems)"]},
{"cell_type":"code","execution_count":66,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:46:09.751707Z","iopub.status.busy":"2021-12-
31T06:46:09.751139Z","iopub.status.idle":"2021-12-
31T06:46:09.758027Z","shell.execute_reply":"2021-12-
31T06:46:09.757151Z","shell.execute_reply.started":"2021-12-
31T06:46:09.751645Z"},"colab":{"base_uri":"https://
localhost:8080/","height":35},"id":"4G6Hlif3fXJd","executionInfo":
{"status":"ok","timestamp":1702119373440,"user_tz":-360,"elapsed":587,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"a39ee314-dc57-4429-b084-
b95298f03f30"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'walk walk walk walk'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":66}],"source":["sample = \"walk
walks walking walked\"\n","stem_words(sample)"]},
{"cell_type":"code","execution_count":67,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:46:13.794724Z","iopub.status.busy":"2021-12-
31T06:46:13.794420Z","iopub.status.idle":"2021-12-
31T06:46:13.800289Z","shell.execute_reply":"2021-12-
31T06:46:13.799470Z","shell.execute_reply.started":"2021-12-
31T06:46:13.794690Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"z9yy0koLfXJd","executionInfo":{"status":"ok","time
stamp":1702119380043,"user_tz":-360,"elapsed":829,"user":{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"b3bbe29f-5233-4aeb-cf6f-
eb2b2bc7f4c4"},"outputs":[{"output_type":"stream","name":"stdout","text":["probably
my alltime favorite movie a story of selflessness sacrifice and dedication to a
noble cause but its not preachy or boring it just never gets old despite my having
seen it some 15 or more times in the last 25 years paul lukas performance brings
tears to my eyes and bette davis in one of her very few truly sympathetic roles is
a delight the kids are as grandma says more like dressedup midgets than children
but that only makes them more fun to watch and the mothers slow awakening to whats
happening in the world and under her own roof is believable and startling if i had
a dozen thumbs theyd all be up for this movie\n"]}],"source":["text = 'probably my
alltime favorite movie a story of selflessness sacrifice and dedication to a noble
cause but its not preachy or boring it just never gets old despite my having seen
it some 15 or more times in the last 25 years paul lukas performance brings tears
to my eyes and bette davis in one of her very few truly sympathetic roles is a
delight the kids are as grandma says more like dressedup midgets than children but
that only makes them more fun to watch and the mothers slow awakening to whats
happening in the world and under her own roof is believable and startling if i had
a dozen thumbs theyd all be up for this movie'\n","print(text)"]},
{"cell_type":"code","execution_count":68,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:46:38.928962Z","iopub.status.busy":"2021-12-
31T06:46:38.928309Z","iopub.status.idle":"2021-12-
31T06:46:38.938004Z","shell.execute_reply":"2021-12-
31T06:46:38.937471Z","shell.execute_reply.started":"2021-12-
31T06:46:38.928912Z"},"colab":{"base_uri":"https://
localhost:8080/","height":104},"id":"UO-aJt_cfXJe","executionInfo":
{"status":"ok","timestamp":1702119385634,"user_tz":-360,"elapsed":587,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"4e6f19e3-3cc9-40c1-e599-
e7835adf59a6"},"outputs":[{"output_type":"execute_result","data":{"text/plain":
["'probabl my alltim favorit movi a stori of selfless sacrific and dedic to a nobl
caus but it not preachi or bore it just never get old despit my have seen it some
15 or more time in the last 25 year paul luka perform bring tear to my eye and bett
davi in one of her veri few truli sympathet role is a delight the kid are as
grandma say more like dressedup midget than children but that onli make them more
fun to watch and the mother slow awaken to what happen in the world and under her
own roof is believ and startl if i had a dozen thumb theyd all be up for thi
movi'"],"application/vnd.google.colaboratory.intrinsic+json":
{"type":"string"}},"metadata":{},"execution_count":68}],"source":
["stem_words(text)"]},{"cell_type":"markdown","metadata":
{"id":"40L9UIwhfXJe"},"source":["# Lemmatization"]},
{"cell_type":"code","execution_count":69,"metadata":{"execution":
{"iopub.execute_input":"2021-12-31T06:50:09.585926Z","iopub.status.busy":"2021-12-
31T06:50:09.585607Z","iopub.status.idle":"2021-12-
31T06:50:11.918170Z","shell.execute_reply":"2021-12-
31T06:50:11.917310Z","shell.execute_reply.started":"2021-12-
31T06:50:09.585885Z"},"colab":{"base_uri":"https://
localhost:8080/"},"id":"eAf31FTGfXJe","executionInfo":
{"status":"ok","timestamp":1702119457885,"user_tz":-360,"elapsed":4368,"user":
{"displayName":"colab0
ineuron","userId":"16851312232179065356"}},"outputId":"313e879a-558f-407e-b406-
9ddaf7516803"},"outputs":[{"output_type":"stream","name":"stderr","text":
["[nltk_data] Downloading package wordnet to /root/nltk_data...\n","[nltk_data]
Downloading package omw-1.4 to /root/nltk_data...\n"]},
{"output_type":"stream","name":"stdout","text":["Word Lemma
\n","He He \n","was be
\n","running run \n","and and
\n","eating eat \n","at at
\n","same same \n","time time
\n","He He \n","has have
\n","bad bad \n","habit habit
\n","of of \n","swimming swim
\n","after after \n","playing play
\n","long long \n","hours hours
\n","in in \n","the the
\n","Sun Sun \n"]}],"source":["import nltk\n","from
nltk.stem import WordNetLemmatizer\n",
"\n",
"# Corpora fetched for use by WordNetLemmatizer.\n",
"nltk.download('wordnet')\n",
"nltk.download('omw-1.4')\n",
"wordnet_lemmatizer = WordNetLemmatizer()\n",
"\n",
"sentence = \"He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun.\"\n",
"punctuations = \"?:!.,;\"\n",
"# One shared layout for the header and every row: two 20-character columns.\n",
"ROW_FORMAT = \"{0:20}{1:20}\"\n",
"\n",
"# Filter into a new list instead of calling remove() inside the loop:\n",
"# removing while iterating shifts the later items, so the token right\n",
"# after each removed one is never checked (e.g. the second mark in \"?!\").\n",
"# `word not in punctuations` is the same substring test the cell used before.\n",
"sentence_words = [\n",
"    word for word in nltk.word_tokenize(sentence) if word not in punctuations\n",
"]\n",
"\n",
"print(ROW_FORMAT.format(\"Word\", \"Lemma\"))\n",
"for word in sentence_words:\n",
"    # pos='v' asks the lemmatizer to treat every token as a verb.\n",
"    print(ROW_FORMAT.format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))"]},
{"cell_type":"markdown","metadata":{"id":"TrSXAkATfXJf"},"source":["#### NOTE:
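Stemming and lemmatization both reduce words to a root form, but lemmatization
usually gives better results because it returns real dictionary words, whereas stemming can leave truncated non-words (e.g. 'movi'). Lemmatization is slower; stemming is faster."]},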
{"cell_type":"code","execution_count":null,"metadata":
{"id":"DcfHxG6TfXJf"},"outputs":[],"source":[]}],"metadata":{"kernelspec":
{"display_name":"Python 3","language":"python","name":"python3"},"language_info":
{"codemirror_mode":
{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-
python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","
version":"3.8.5"},"colab":{"provenance":[]}},"nbformat":4,"nbformat_minor":0}