{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "28667795-d93e-427f-9063-13ad6e133b09", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 3, "id": "81dd7556-356c-428f-98ee-87d94c1f9a5a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabeltweet
010@user when a father is dysfunctional and is s...
120@user @user thanks for #lyft credit i can't us...
230bihday your majesty
340#model i love u take with u all the time in ...
450factsguide: society now #motivation
\n", "
" ], "text/plain": [ " id label tweet\n", "0 1 0 @user when a father is dysfunctional and is s...\n", "1 2 0 @user @user thanks for #lyft credit i can't us...\n", "2 3 0 bihday your majesty\n", "3 4 0 #model i love u take with u all the time in ...\n", "4 5 0 factsguide: society now #motivation" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweets = pd.read_csv(\"/home/renato/Downloads/archive/train_E6oV3lV.csv\")\n", "tweets.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "27e6bdea-6fa6-4b7c-a365-219eef16a005", "metadata": {}, "outputs": [], "source": [ "words = tweets.tweet.sum().split()" ] }, { "cell_type": "code", "execution_count": 6, "id": "0ff751d7-b97d-4141-b27a-238fb1d98a5b", "metadata": {}, "outputs": [], "source": [ "freqdist = Counter(words)" ] }, { "cell_type": "code", "execution_count": 7, "id": "f97bbe67-6242-4dd2-b6b9-5c95c6054248", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('@user', 15856),\n", " ('the', 9934),\n", " ('to', 9758),\n", " ('a', 6202),\n", " ('i', 5240),\n", " ('you', 4895),\n", " ('and', 4821),\n", " ('in', 4552),\n", " ('for', 4427),\n", " ('of', 4151),\n", " ('is', 4088),\n", " ('my', 3533),\n", " ('on', 2542),\n", " ('with', 2474),\n", " ('this', 2377),\n", " ('be', 2349),\n", " ('it', 2080),\n", " ('so', 1820),\n", " ('all', 1802),\n", " ('are', 1778)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "freqdist.most_common(20)" ] }, { "cell_type": "code", "execution_count": 8, "id": "d09d361b-e5d4-42c2-9614-39ee524b1eb1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['@user',\n", " 'when',\n", " 'a',\n", " 'father',\n", " 'is',\n", " 'dysfunctional',\n", " 'and',\n", " 'is',\n", " 'so',\n", " 'selfish']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words[0:10]" ] }, { "cell_type": "code", "execution_count": 9, "id": "e8cad3c8-ce24-470e-b52e-606897cd0a31", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "414005" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(words)" ] }, { "cell_type": "code", "execution_count": 10, "id": "9b9dda20-dbc4-4295-a274-04aaa47a396f", "metadata": {}, "outputs": [], "source": [ "setofwords = set(words)" ] }, { "cell_type": "code", "execution_count": 11, "id": "7ef45e72-cac8-40bc-a66b-ce71b14680b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "70523" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(setofwords)" ] }, { "cell_type": "code", "execution_count": 12, "id": "ca485f2c-ba0c-42a3-bb68-4ef6e6838904", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package words to /home/renato/nltk_data...\n", "[nltk_data] Package words is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "nltk.download('words')" ] }, { "cell_type": "code", "execution_count": 13, "id": "0070f1bd-d290-4643-9b97-07f6cc625993", "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import words as corpus" ] }, { "cell_type": "code", "execution_count": 14, "id": "b23787c7-5f99-490a-9aec-2ff74d044031", "metadata": {}, "outputs": [], "source": [ "out_of_dictionary = [w for w in setofwords if w not in corpus.words()]" ] }, { "cell_type": "code", "execution_count": 15, "id": "edb561ef-369c-4389-a24d-bb1d73702090", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['boxâ\\x80¦',\n", " 'halloween#family',\n", " 'when?????',\n", " 'wars',\n", " 'hea!',\n", " '#festivals',\n", " '#bahrain',\n", " \"'great'\",\n", " 'muslims\"',\n", " '#reboot']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "out_of_dictionary[0:10]" ] }, { "cell_type": "code", "execution_count": null, "id": "07216bde-d2ee-4027-a2c6-cb1da37cff62", "metadata": {}, "outputs": [], "source": [ "out_of_dictionaryex1 = \"(\\#\\w+)\"\n", "tweets.tweet.str.extract(regex1)" ] }, { "cell_type": "code", "execution_count": null, "id": "1213c098-9683-4e32-ad0c-38597b176f74", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }