Supervised vs. Rule-Based
import pandas as pd
from google.colab import files
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from sklearn.preprocessing import LabelEncoder
nltk.download("vader_lexicon")
True
files.upload()
Tweets = pd.read_csv("Tweets2.csv")
Tweets.shape
(74682, 4)
Tweets.head()
Tweets.groupby(['sentiment']).size()
sentiment
Irrelevant 12990
Negative 22542
Neutral 18318
Positive 20832
dtype: int64
# fold the 'Irrelevant' class into 'Neutral', leaving three classes
Tweets.loc[Tweets['sentiment'] == 'Irrelevant', 'sentiment'] = 'Neutral'
Tweets = Tweets.dropna(subset=['text'])   # drop rows with missing tweet text
Tweets.reset_index(drop=True, inplace=True)
Tweets.shape
(73996, 4)
Tweets.groupby(['sentiment']).size()
sentiment
Negative 22358
Neutral 30983
Positive 20655
dtype: int64
Supervised
token = Tokenizer(num_words=100)   # keep only the 100 most frequent words
token.fit_on_texts(Tweets['text'].values)
X = token.texts_to_sequences(Tweets['text'].values)
X = pad_sequences(X, padding="post", maxlen=100)   # pad/truncate every tweet to 100 tokens
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(Tweets['sentiment'])   # Negative=0, Neutral=1, Positive=2
print(y)
[2 2 2 ... 2 2 2]
y = to_categorical(y)   # one-hot encode the integer labels
print(y)
[[0. 0. 1.]
[0. 0. 1.]
[0. 0. 1.]
...
[0. 0. 1.]
[0. 0. 1.]
[0. 0. 1.]]
modelo = Sequential()
# with num_words=100 the token ids never exceed 99, so this input_dim is larger
# than strictly needed; input_length is deprecated in recent Keras (see warning)
modelo.add(Embedding(input_dim=len(token.word_index), output_dim=128,
                     input_length=X.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0,
                activation='tanh', recurrent_activation='sigmoid',
                unroll=False, use_bias=True))
modelo.add(Dense(units=3, activation="softmax"))   # one unit per sentiment class
/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/embedding.py:90:
UserWarning: Argument `input_length` is deprecated. Just remove it.
  warnings.warn(
modelo.compile(loss='categorical_crossentropy', optimizer='adam',
               metrics=['accuracy'])
print(modelo.summary())
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape        ┃     Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ embedding_1 (Embedding)         │ ?                   │ 0 (unbuilt) │
├─────────────────────────────────┼─────────────────────┼─────────────┤
│ spatial_dropout1d_1             │ ?                   │ 0 (unbuilt) │
│ (SpatialDropout1D)              │                     │             │
├─────────────────────────────────┼─────────────────────┼─────────────┤
│ lstm_1 (LSTM)                   │ ?                   │ 0 (unbuilt) │
├─────────────────────────────────┼─────────────────────┼─────────────┤
│ dense_1 (Dense)                 │ ?                   │ 0 (unbuilt) │
└─────────────────────────────────┴─────────────────────┴─────────────┘
None
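The cell that splits the data and trains the model is not preserved in the transcript. A minimal sketch consistent with the step counts in the logs (89 training batches per epoch below, 925 evaluation batches later) would be the following, where test_size=0.4 and batch_size=500 are inferred from those counts, not confirmed:

from sklearn.model_selection import train_test_split

# Assumed cell (not in the transcript): a 60/40 split leaves ~44.4k training
# rows, and batch_size=500 yields the 89 steps/epoch seen in the log below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=0)
modelo.fit(X_train, y_train, epochs=5, batch_size=500)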
Epoch 1/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 6s 46ms/step - accuracy: 0.4188 - loss: 1.0838
Epoch 2/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 4s 44ms/step - accuracy: 0.4183 - loss: 1.0831
Epoch 3/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 5s 45ms/step - accuracy: 0.4200 - loss: 1.0821
Epoch 4/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 4s 46ms/step - accuracy: 0.4156 - loss: 1.0837
Epoch 5/5
89/89 ━━━━━━━━━━━━━━━━━━━━ 4s 45ms/step - accuracy: 0.4183 - loss: 1.0825
<keras.src.callbacks.history.History at 0x7d98d00fe200>
_, accuracy = modelo.evaluate(X_test, y_test)
print("Accuracy: ", accuracy)
925/925 ━━━━━━━━━━━━━━━━━━━━ 4s 4ms/step - accuracy: 0.4207 - loss: 1.0815
Accuracy:  0.41920334100723267
VADER
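As background for the loop below, polarity_scores returns the fraction of negative, neutral, and positive content in a text plus an aggregate compound score; dropping compound and taking the largest remaining key turns the scores into a categorical label. A quick sketch (the numeric values here are illustrative, not actual output):

mas = SentimentIntensityAnalyzer()
scores = mas.polarity_scores("I love this game")
# scores is a dict like {'neg': 0.0, 'neu': 0.33, 'pos': 0.67, 'compound': 0.64}
del scores['compound']
print(max(scores, key=scores.get))   # -> 'pos' for a clearly positive sentence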
mas = SentimentIntensityAnalyzer()
Tweets['vader_sentiment'] = ''
for i in range(len(Tweets.index)):
    scores = mas.polarity_scores(Tweets['text'].iloc[i])
    del scores['compound']   # keep only the neg/neu/pos proportions
    # label each tweet with whichever proportion is largest
    Tweets.loc[i, 'vader_sentiment'] = max(scores, key=scores.get)
Tweets.groupby(['vader_sentiment']).size()
vader_sentiment
neg     3660
neu    65581
pos     4755
dtype: int64
Tweets.groupby(['sentiment']).size()
sentiment
Negative 22358
Neutral 30983
Positive 20655
dtype: int64
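Between this tally and the next, the VADER labels switch from neg/neu/pos to the dataset's Negative/Neutral/Positive naming. The remapping cell is missing from the transcript, but a one-liner like this (assumed, not confirmed) produces the counts below:

# Assumed cell: rename VADER's labels to match the dataset's
Tweets['vader_sentiment'] = Tweets['vader_sentiment'].map(
    {'neg': 'Negative', 'neu': 'Neutral', 'pos': 'Positive'})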
Tweets.groupby(['vader_sentiment']).size()
vader_sentiment
Negative     3660
Neutral     65581
Positive     4755
dtype: int64
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = Tweets['vader_sentiment']
y_test = Tweets['sentiment']
cm = confusion_matrix(y_test, y_pred)
print(cm)                                # matrix printout not preserved here
print(accuracy_score(y_test, y_pred))    # overall agreement with the given labels
0.44886210065408944
On these numbers, rule-based VADER (≈0.449 agreement over the full dataset) edges out the supervised LSTM (≈0.419 test accuracy), though both remain close to chance for three classes.