Supervisionado vs Regras (Supervised vs Rules)

### Prof. Fernando Amaral https://www.eia.ai/

import pandas as pd
import numpy as np
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...

True

files.upload()

Tweets = pd.read_csv("Tweets2.csv")
Tweets.shape

(74682, 4)

Tweets.head()

{"summary":"{\n \"name\": \"Tweets\",\n \"rows\": 74682,\n


\"fields\": [\n {\n \"column\": \"id\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 3740,\n
\"min\": 1,\n \"max\": 13200,\n \"num_unique_values\":
12447,\n \"samples\": [\n 1616,\n 2660,\n
2335\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"local\",\n \"properties\": {\n \"dtype\": \"category\",\
n \"num_unique_values\": 32,\n \"samples\": [\n
\"Cyberpunk2077\",\n \"Microsoft\",\n
\"TomClancysRainbowSix\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"sentiment\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"Neutral\",\n \"Irrelevant\",\n
\"Positive\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"text\",\n \"properties\": {\n \"dtype\": \"string\",\n
\"num_unique_values\": 69491,\n \"samples\": [\n
\"Thanks to @ Kain0025 for the raid. Thanks to @ gamingstreams and @
velonese002 for the bitts! And thanks to @ ColTrysTohete for hanging
out and hanging out!. I hope to continue streaming regularly.. watch
the w / @ Cohtstreams _ coming live!\",\n \"How not to get
bored about every damn thing in life.\",\n \"The Best Way to
Protect the Samsung Galaxy Note10+ buff.ly/2zkjIhU <unk> ^\"\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"Tweets"}

Tweets.groupby(['sentiment']).size()

sentiment
Irrelevant 12990
Negative 22542
Neutral 18318
Positive 20832
dtype: int64

Tweets.loc[Tweets['sentiment']=='Irrelevant','sentiment'] = 'Neutral'

Tweets = Tweets.dropna(subset=['text'])
Tweets.reset_index(drop=True, inplace=True)

Tweets.shape

(73996, 4)

Tweets.groupby(['sentiment']).size()

sentiment
Negative 22358
Neutral 30983
Positive 20655
dtype: int64

Supervised

token = Tokenizer(num_words=100)
token.fit_on_texts(Tweets['text'].values)

X = token.texts_to_sequences(Tweets['text'].values)
X = pad_sequences(X, padding="post", maxlen=100)
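
A quick check, reusing the token and X objects defined above, makes the effect of num_words=100 explicit: the tokenizer learns the full vocabulary during fitting, but texts_to_sequences only emits indices for the 100 most frequent words and silently drops everything else (a sketch; the printed values depend on the data).

print(len(token.word_index))   # full vocabulary learned by fit_on_texts
print(X.max())                 # highest index actually used; capped at num_words - 1 = 99
print(X[0])                    # first tweet as a padded sequence of word indices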

labelencoder = LabelEncoder()
y = labelencoder.fit_transform(Tweets['sentiment'])
print(y)

[2 2 2 ... 2 2 2]

y = to_categorical(y)
print(y)

[[0. 0. 1.]
[0. 0. 1.]
[0. 0. 1.]
...
[0. 0. 1.]
[0. 0. 1.]
[0. 0. 1.]]
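
To see which position of the one-hot vectors corresponds to which class, the fitted labelencoder can be inspected (LabelEncoder orders classes alphabetically, so Negative maps to 0, Neutral to 1 and Positive to 2; the [0. 0. 1.] rows above are therefore Positive tweets):

print(labelencoder.classes_)   # ['Negative' 'Neutral' 'Positive']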

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

X_test

array([[95, 99,  3, ...,  0,  0,  0],
       [13, 18,  4, ...,  0,  0,  0],
       [10,  7,  5, ...,  0,  0,  0],
       ...,
       [37, 11,  7, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [87,  7, 17, ...,  0,  0,  0]], dtype=int32)

modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index), output_dim=128,
                     input_length=X.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0,
                activation='tanh', recurrent_activation='sigmoid',
                unroll=False, use_bias=True))
modelo.add(Dense(units=3, activation="softmax"))

/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/
embedding.py:90: UserWarning: Argument `input_length` is deprecated.
Just remove it.
warnings.warn(

modelo.compile(loss='categorical_crossentropy', optimizer='adam',
               metrics=['accuracy'])
print(modelo.summary())

Model: "sequential_1"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳
━━━━━━━━━━━━━━━━━┓
┃ Layer (type) ┃ Output Shape ┃
Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇
━━━━━━━━━━━━━━━━━┩
│ embedding_1 (Embedding) │ ? │
0 (unbuilt) │
├──────────────────────────────────────┼─────────────────────────────┼
─────────────────┤
│ spatial_dropout1d_1 │ ? │
0 (unbuilt) │
│ (SpatialDropout1D) │ │

├──────────────────────────────────────┼─────────────────────────────┼
─────────────────┤
│ lstm_1 (LSTM) │ ? │
0 (unbuilt) │
├──────────────────────────────────────┼─────────────────────────────┼
─────────────────┤
│ dense_1 (Dense) │ ? │
0 (unbuilt) │
└──────────────────────────────────────┴─────────────────────────────┴
─────────────────┘

Total params: 0 (0.00 B)

Trainable params: 0 (0.00 B)

Non-trainable params: 0 (0.00 B)

None
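
The zeroed parameter counts are expected here: recent Keras versions only build a Sequential model when data first flows through it, so every layer reports "0 (unbuilt)" until then. One way to force the build up front, sketched under the assumption that Model.build is available in the installed Keras version:

modelo.build(input_shape=(None, X.shape[1]))   # create the weights for length-100 sequences
print(modelo.summary())                        # parameter counts now appear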

modelo.fit(X_train, y_train, epochs=5, batch_size=500,verbose=True)

Epoch 1/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 6s 46ms/step - accuracy: 0.4188 - loss: 1.0838
Epoch 2/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 4s 44ms/step - accuracy: 0.4183 - loss: 1.0831
Epoch 3/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 5s 45ms/step - accuracy: 0.4200 - loss: 1.0821
Epoch 4/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 4s 46ms/step - accuracy: 0.4156 - loss: 1.0837
Epoch 5/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 4s 45ms/step - accuracy: 0.4183 - loss: 1.0825

<keras.src.callbacks.history.History at 0x7d98d00fe200>

_, accuracy = modelo.evaluate(X_test, y_test)
print("Accuracy: ", accuracy)

925/925 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.4207 - loss: 1.0815
Accuracy:  0.41920334100723267
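
An accuracy of roughly 0.42 says little about which classes the network confuses, so a confusion matrix on the same test split gives a fairer comparison with VADER below. A sketch, reusing modelo, X_test and y_test from the split above (exact counts will vary between runs):

probas = modelo.predict(X_test)              # softmax probabilities, shape (n_samples, 3)
y_pred_lstm = np.argmax(probas, axis=1)      # predicted class index per tweet
y_true_lstm = np.argmax(y_test, axis=1)      # true class index from the one-hot labels
print(confusion_matrix(y_true_lstm, y_pred_lstm))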

VADER

mas = SentimentIntensityAnalyzer()
Tweets['vander_sentiment'] = ''

for y in range(len(Tweets.index)):
    x = mas.polarity_scores(Tweets['text'].iloc[y])
    del x['compound']
    maior = max(x, key=x.get)  # keep the strongest of neg / neu / pos
    Tweets.loc[y, 'vander_sentiment'] = maior
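
polarity_scores returns a dict with neg, neu, pos and compound scores; the loop above discards compound and keeps the label with the highest remaining score. A minimal illustration on a made-up sentence:

exemplo = mas.polarity_scores("I absolutely love this game, best purchase ever!")
print(exemplo)   # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}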

Tweets.groupby(['vander_sentiment']).size()

vander_sentiment
neg 3660
neu 65581
pos 4755
dtype: int64

Tweets.groupby(['sentiment']).size()

sentiment
Negative 22358
Neutral 30983
Positive 20655
dtype: int64

Tweets.loc[Tweets['vander_sentiment'] == 'neu', 'vander_sentiment'] = 'Neutral'
Tweets.loc[Tweets['vander_sentiment'] == 'neg', 'vander_sentiment'] = 'Negative'
Tweets.loc[Tweets['vander_sentiment'] == 'pos', 'vander_sentiment'] = 'Positive'

Tweets.groupby(['vander_sentiment']).size()

vander_sentiment
Negative 3660
Neutral 65581
Positive 4755
dtype: int64

y_pred = Tweets['vander_sentiment']
y_test = Tweets['sentiment']
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 2004 19902   452]
 [ 1122 28384  1477]
 [  534 17295  2826]]

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.44886210065408944
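
Accuracy alone hides how skewed the VADER predictions are: the confusion matrix above shows that most tweets, whatever their true label, end up predicted as Neutral. A per-class breakdown would make that explicit (a sketch; classification_report is an extra scikit-learn import not used elsewhere in this notebook):

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))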
