Codingan Materi
Codingan Materi
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'gojek-app-reviews-bahasa-indonesia:https%3A%2F%2Ffanyv88.com%3A443%2Fhttps%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4394044%2F7545387%2Fbundle
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'
# Expose the Kaggle directories as ../input and ../working so that relative
# paths used by the notebook resolve the same way outside of Kaggle.
for _target, _link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(_target, os.path.join("..", _link_name), target_is_directory=True)
    except FileExistsError:
        # The link is already there (e.g. the cell was run before) - nothing to do.
        pass
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 1/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
Downloaded and uncompressed: gojek-app-reviews-bahasa-indonesia
Data source import complete.
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save &
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/gojek-app-reviews-bahasa-indonesia/GojekAppReviewV4.0.0-V4.9.3_Cleaned.csv
2022-01-21
0 Yuga Edit akun gopay saya di blok 1 4.9.3
10:52:12
# Number of reviews whose app version starts with "4.8".
# na=False counts rows with a missing appVersion as non-matches, so the total
# stays an integer instead of turning into NaN; .sum() runs vectorised.
df['appVersion'].str.startswith("4.8", na=False).sum()
8091
df.head()
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 2/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# tokenization
# Fetch the NLTK resources ('stopwords' corpus, 'punkt' tokenizer data) before
# importing the helpers that rely on them.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
# remove duplicates: rows with a missing 'content' are dropped first, then
# fully identical rows are collapsed into one
df = df.dropna(subset=['content']).drop_duplicates()
# stopwords
stop_words = stopwords.words('indonesian') + stopwords.words('english') + ["yg", "gak", "ngisi", "udah", "d", "sih", "nya", "srg", "utk", "b
df['content'] = df['content'].apply(lambda x: [word.lower() for word in word_tokenize(x) if (word.isalpha() and word.lower() not in stop_wor
# text normalization: glue each review's tokens back into a single
# space-separated string
df['content'] = df['content'].apply(' '.join)
df.head()
Collecting Sastrawi
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.7/209.7 kB 2.0 MB/s eta 0:00:00
Installing collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Collecting VaderSentiment
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 1.3 MB/s eta 0:00:00
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from VaderSentiment) (2.31.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (3.3.
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (2024.2.2)
Installing collected packages: VaderSentiment
Successfully installed VaderSentiment-3.3.2
# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()


def _stem_review(text):
    """Stem each whitespace-separated token of *text* and re-join with single spaces."""
    return ' '.join(stemmer.stem(token) for token in text.split())


df['content'] = df['content'].apply(_stem_review)
df.head(5)
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 3/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# labelling
# Extend VADER's lexicon with hand-picked Indonesian words so that the analyzer
# can score the (Indonesian) review text.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# Word -> valence weight; negative values mark negative sentiment, positive
# values positive sentiment, 0 marks the word as neutral.
# NOTE(review): VADER's built-in lexicon reportedly uses a -4..+4 valence scale,
# while every weight here lies within -1..+1 - confirm the intended magnitude.
# NOTE(review): 'content' is stemmed before this step, so keys such as
# 'benerin', 'ngelag' or 'nagih' only match if the stemmer leaves them
# unchanged - TODO confirm.
additional_lexicon_id = {
'kecewa': -0.4,
'rugi': -1,
'buruk': -0.6,
'jelek': -0.6,
'lelet': -0.7,
'gagal': -0.5,
'parah': -0.6,
'mahal': -0.3,
'tolong': -0.1,
'hilang': -0.3,
'gajelas': -0.3,
'gj': -0.3,
'promo': 0.6,
'kadang': -0.1,
'maling': -0.5,
'ganggu': 0.3,  # NOTE(review): positive weight, unlike the other complaint words - possibly a sign typo; confirm
'sedot': -0.5,
'bagus': 0.5,
'pulsa': 0,
'potong': -1,
'baik': 0.5,
'kntl': -1,
'ngelag': -0.8,
'salah': -0.5,
'bintang': 0,
'benerin': -0.4,
'lambat': -0.8,
'siput': -0.4,
'mati': -0.7,
'minimal': -0.3,
'susah': -0.6,
'nagih': -0.6,
'capek': -0.7,
'kacau': -0.3,
'tagih': -0.3,
'mantap': 1,
'puas': 0.9,
'sampah': -0.5,
'sulit': -0.6,
'aneh': -0.4,
}
# Merge the custom entries into the analyzer's lexicon; existing entries with
# the same key are overwritten.
analyzer.lexicon.update(additional_lexicon_id)
df
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 4/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
8221 Mutiara Purnama bye bye gojek alih ijo lah mantapp 1 Netral
# TF-IDF
# Turn the cleaned review text into a TF-IDF document-term matrix.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# NOTE(review): the vectorizer is fitted on the whole 'content' column; if the
# train/test split happens afterwards, test documents influence the vocabulary
# and IDF weights - confirm against the split cell.
X_tfidf = tfidf_vectorizer.fit_transform(df['content'])
keyboard_arrow_down WordCloud
# Display the word cloud for neutral reviews; wordcloud_netral is built in a
# cell that is not visible in this part of the notebook.
plt.imshow(wordcloud_netral, interpolation="bilinear")
# plt.axis('off')
plt.title('Word Cloud dari Sentimen Netral')
plt.show()
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 5/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Build and display a word cloud from every review labelled 'Positif'.
df_positif = df[df['sentimen'] == 'Positif']
# str.join can consume the column directly; no intermediate list is needed.
all_words_positif = ' '.join(df_positif['content'])
wordcloud_positif = WordCloud(
    width=500,
    height=300,
    random_state=21,  # fixed seed so the layout is reproducible
    max_font_size=110,
).generate(all_words_positif)
plt.imshow(wordcloud_positif, interpolation="bilinear")
# plt.axis('off')
plt.title('Word Cloud dari Sentimen Positif')
plt.show()
# Display the word cloud for negative reviews; wordcloud_negatif is built in a
# cell that is not visible in this part of the notebook.
plt.imshow(wordcloud_negatif, interpolation="bilinear")
# plt.axis('off')
plt.title('Word Cloud dari Sentimen Negatif')
plt.show()
# Number of reviews per sentiment label, largest class first.
# Selecting 'content' before counting avoids counting every other column only
# to throw the result away; the resulting frame is the same.
temp = (
    df.groupby('sentimen')['content']
    .count()
    .reset_index()
    .sort_values(by='content', ascending=False)
)
# Render the table with a colour gradient over the counts.
temp.style.background_gradient(cmap='inferno_r')
sentimen content
1 Netral 3987
2 Positif 2541
0 Negatif 1562
# Bar chart with one bar per value of the 'sentimen' column of df.
plt.figure(figsize=(12,6))
sns.countplot(x='sentimen',data=df)
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 6/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Funnel-area chart of the per-sentiment counts held in `temp`.
funnel_title = {"position": "top center", "text": "Funnel-Chart dari Distribusi target"}
funnel_trace = go.Funnelarea(text=temp.sentimen, values=temp.content, title=funnel_title)
fig = go.Figure(funnel_trace)
fig.show()
Funnel-Chart
Collecting palettable
Downloading palettable-3.3.3-py2.py3-none-any.whl (332 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 332.3/332.3 kB 2.2 MB/s eta 0:00:00
Installing collected packages: palettable
Successfully installed palettable-3.3.3
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 7/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Donut chart of the most frequent words in the neutral reviews.
netral_tokens = df_netral['content'].str.split(expand=True).stack()
unique_netral_words = netral_tokens.value_counts().reset_index()
unique_netral_words.columns = ['words', 'count']
# Despite the variable name, only the first 12 rows are plotted.
top_20_words = unique_netral_words.head(12)

plt.figure(figsize=(12, 6))
plt.pie(top_20_words['count'], labels=top_20_words['words'], colors=Pastel1_7.hex_colors)
# A white disc drawn over the centre of the pie turns it into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
current_axes = plt.gca()
current_axes.add_artist(my_circle)
plt.title('Donut Plot dari Sentimen Netral')
plt.show()
# Donut chart of the most frequent words in the positive reviews.
positif_tokens = df_positif['content'].str.split(expand=True).stack()
unique_positif_words = positif_tokens.value_counts().reset_index()
unique_positif_words.columns = ['words', 'count']
# Despite the variable name, only the first 12 rows are plotted.
top_20_words = unique_positif_words.head(12)

plt.figure(figsize=(12, 6))
plt.pie(top_20_words['count'], labels=top_20_words['words'], colors=Pastel1_7.hex_colors)
# A white disc drawn over the centre of the pie turns it into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
current_axes = plt.gca()
current_axes.add_artist(my_circle)
plt.title('Donut Plot dari Sentimen positif')
plt.show()
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 8/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Donut chart of the most frequent words in the negative reviews.
negatif_tokens = df_negatif['content'].str.split(expand=True).stack()
unique_negatif_words = negatif_tokens.value_counts().reset_index()
unique_negatif_words.columns = ['words', 'count']
# Despite the variable name, only the first 12 rows are plotted.
top_20_words = unique_negatif_words.head(12)

plt.figure(figsize=(12, 6))
plt.pie(top_20_words['count'], labels=top_20_words['words'], colors=Pastel1_7.hex_colors)
# A white disc drawn over the centre of the pie turns it into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
current_axes = plt.gca()
current_axes.add_artist(my_circle)
plt.title('Donut Plot dari Sentimen negatif')
plt.show()
keyboard_arrow_down SPLIT
# splitting
# NOTE(review): the train_test_split call that creates X_train / y_train and the
# import of SMOTE are not visible in this part of the notebook.
from sklearn.model_selection import train_test_split
# Oversample with SMOTE (fixed seed for reproducibility); only the training
# portion is resampled here.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
# Plot the class distribution of the resampled training labels.
plt.figure(figsize=(12, 6))
sns.countplot(x=y_train)
plt.title('Distribusi target untuk modeling')
plt.show()
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 9/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
keyboard_arrow_down Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameter values for the random forest
# (presumably the search space handed to RandomizedSearchCV - confirm against
# the search cell).
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
▸ RandomizedSearchCV
▸ estimator: RandomForestClassifier
▸ RandomForestClassifier
Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
# model evaluation
from sklearn.metrics import classification_report
# Predict the test set with the best estimator found by the search object
# rf_model (created in a cell not visible here).
y_pred_rf = rf_model.best_estimator_.predict(X_test)
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 10/10