0% found this document useful (0 votes)
21 views10 pages

Codingan Materi

Uploaded by

David Baridji
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
21 views10 pages

Codingan Materi

Uploaded by

David Baridji
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 10

4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'gojek-app-reviews-bahasa-indonesia:https%3A%2F%2Ffanyv88.com%3A443%2Fhttps%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4394044%2F7545387%2Fbundle

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null


# Start from a clean input directory, then (re)create the Kaggle-style
# input/working directories used by the rest of the notebook.
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working. A link left over from
# a previous run is deliberately kept as-is (best effort).
for _link_target, _link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(_link_target, os.path.join("..", _link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<url-encoded download URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the remainder is the (percent-encoded) URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing (e.g. chunked transfer);
            # in that case only the running byte count is shown.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')

account_circle Downloading gojek-app-reviews-bahasa-indonesia, 8220381 bytes compressed


[==================================================] 8220381 bytes downloaded

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 1/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
Downloaded and uncompressed: gojek-app-reviews-bahasa-indonesia
Data source import complete.

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory


# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gojek-app-reviews-bahasa-indonesia/GojekAppReviewV4.0.0-V4.9.3_Cleaned.csv

keyboard_arrow_down Load Data


# Read the cleaned Gojek review export into a DataFrame and preview it.
df = pd.read_csv(
    '/kaggle/input/gojek-app-reviews-bahasa-indonesia/GojekAppReviewV4.0.0-V4.9.3_Cleaned.csv'
)
df.head()

userName content score at appVersion

2022-01-21
0 Yuga Edit akun gopay saya di blok 1 4.9.3
10:52:12

Lambat sekali sekarang ini 2021-11-30


1 ff burik 3 4.9.3
bosssku apk gojek g... 15:40:38

Anisa Suci Kenapa sih dari kemarin sy 2021-11-29


2 4 4.9.3
Rahmayuliani buka aplikasi gojek... 22:58:12

Baru download gojek dan hape 2022-09-03


3 naoki yakuza 1 493

keyboard_arrow_down EDA & Preprocessing


import seaborn as sns
import matplotlib.pyplot as plt

# Number of reviews whose app version starts with "4.8". Rows with a missing
# version count as non-matching instead of turning the total into NaN.
df['appVersion'].str.startswith("4.8", na=False).sum()

8091

# Keep only the reviews for app version 4.8.x and the columns needed later.
# na=False treats a missing version as "no match"; a NaN in the mask would
# otherwise make the boolean selection fail.
is_v48 = df['appVersion'].str.startswith("4.8", na=False)
df = df.loc[is_v48, ['userName', 'content', 'score']]

df.head()

userName content score

133 Abu karim aljabbar Mkatiksaidi Ramah banget 5

134 Fathan Mubina Setelah update kok nggak bisa dibuka 4

135 Nyauw Jin Fie Good 5

136 Tanaka Kun Good 5

137 Anton S. Sangat membantu 5

Next steps: toggle_off View recommended plots

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 2/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# tokenization
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Discard rows without review text, then remove fully duplicated rows.
df = df.dropna(subset=['content'])
df = df.drop_duplicates()

# stopwords
stop_words = stopwords.words('indonesian') + stopwords.words('english') + ["yg", "gak", "ngisi", "udah", "d", "sih", "nya", "srg", "utk", "b
df['content'] = df['content'].apply(lambda x: [word.lower() for word in word_tokenize(x) if (word.isalpha() and word.lower() not in stop_wor

# Text normalisation: turn each token list back into one space-separated string.
df['content'] = df['content'].map(' '.join)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...


[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
userName content score

133 Abu karim aljabbar Mkatiksaidi ramah banget 5

134 Fathan Mubina update nggak dibuka 4

135 Nyauw Jin Fie good 5

136 Tanaka Kun good 5

137 Anton S. membantu 5

Next steps: toggle_off View recommended plots

!pip install Sastrawi


!pip install VaderSentiment

Collecting Sastrawi
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.7/209.7 kB 2.0 MB/s eta 0:00:00
Installing collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Collecting VaderSentiment
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 1.3 MB/s eta 0:00:00
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from VaderSentiment) (2.31.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (3.3.
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->VaderSentiment) (2024.2.2)
Installing collected packages: VaderSentiment
Successfully installed VaderSentiment-3.3.2

# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()


def _stem_words(text):
    """Stem every whitespace-separated token of *text* and re-join with spaces."""
    return ' '.join(stemmer.stem(token) for token in text.split())


df['content'] = df['content'].apply(_stem_words)

df.head(5)

userName content score

133 Abu karim aljabbar Mkatiksaidi ramah banget 5

134 Fathan Mubina update nggak buka 4

135 Nyauw Jin Fie good 5

136 Tanaka Kun good 5

137 Anton S. bantu 5

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 3/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory

Next steps: toggle_off View recommended plots

# labelling
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# VADER analyzer used to derive the sentiment label of each review.
analyzer = SentimentIntensityAnalyzer()

# Hand-assigned valence for Indonesian (and slang) terms; negative numbers mark
# negative sentiment, positive numbers positive sentiment, 0 neutral.
# NOTE(review): values here lie in [-1, 1]; VADER's bundled lexicon presumably
# uses a wider scale, so these entries may weigh less than English words — confirm.
additional_lexicon_id = {
'kecewa': -0.4,
'rugi': -1,
'buruk': -0.6,
'jelek': -0.6,
'lelet': -0.7,
'gagal': -0.5,
'parah': -0.6,
'mahal': -0.3,
'tolong': -0.1,
'hilang': -0.3,
'gajelas': -0.3,
'gj': -0.3,
'promo': 0.6,
'kadang': -0.1,
'maling': -0.5,
# NOTE(review): 'ganggu' ("disturb") is the only complaint-type word with a
# positive weight — looks like a sign error (-0.3?); confirm before relabelling.
'ganggu': 0.3,
'sedot': -0.5,
'bagus': 0.5,
'pulsa': 0,
'potong': -1,
'baik': 0.5,
'kntl': -1,
'ngelag': -0.8,
'salah': -0.5,
'bintang': 0,
'benerin': -0.4,
'lambat': -0.8,
'siput': -0.4,
'mati': -0.7,
'minimal': -0.3,
'susah': -0.6,
'nagih': -0.6,
'capek': -0.7,
'kacau': -0.3,
'tagih': -0.3,
'mantap': 1,
'puas': 0.9,
'sampah': -0.5,
'sulit': -0.6,
'aneh': -0.4,
}

# Merge the custom entries into the analyzer's lexicon; keys already present
# in the lexicon are overwritten by the values above.
analyzer.lexicon.update(additional_lexicon_id)

df['sentimen'] = df['content'].apply(lambda x: 'Positif' if analyzer.polarity_scores(x)['compound'] > 0 else ('Negatif' if analyzer.polarity

df

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 4/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory

userName content score sentimen

Abu karim aljabbar


133 ramah banget 5 Netral
Mkatiksaidi

134 Fathan Mubina update nggak buka 4 Netral

135 Nyauw Jin Fie good 5 Positif

136 Tanaka Kun good 5 Positif

137 Anton S. bantu 5 Netral

... ... ... ... ...

8219 Malik Azis gopay mantap 5 Positif

buka aplnya hapus download beberpa


8220 Zunus 1 Netral
kali restar...

8221 Mutiara Purnama bye bye gojek alih ijo lah mantapp 1 Netral

8222 Melanie Anggi fitur gomed ya 5 Netral

8223 Ade Nurul Hidayah go food mahal ah 1 Negatif

Next steps: toggle_off View recommended plots

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from every review in df, then encode
# the same reviews as a sparse TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['content'])
X_tfidf = tfidf_vectorizer.transform(df['content'])

keyboard_arrow_down Analisis Sentimen


from wordcloud import WordCloud
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

keyboard_arrow_down WordCloud

# Word cloud built from all reviews labelled 'Netral'.
df_netral = df.loc[df['sentimen'].eq('Netral')]
all_words_netral = ' '.join(df_netral['content'])
wordcloud_netral = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(all_words_netral)

plt.imshow(wordcloud_netral, interpolation="bilinear")
plt.title('Word Cloud dari Sentimen Netral')
plt.show()

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 5/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Word cloud built from all reviews labelled 'Positif'.
df_positif = df.loc[df['sentimen'].eq('Positif')]
all_words_positif = ' '.join(df_positif['content'])
wordcloud_positif = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(all_words_positif)

plt.imshow(wordcloud_positif, interpolation="bilinear")
plt.title('Word Cloud dari Sentimen Positif')
plt.show()

# Word cloud built from all reviews labelled 'Negatif'.
df_negatif = df.loc[df['sentimen'].eq('Negatif')]
all_words_negatif = ' '.join(df_negatif['content'])
wordcloud_negatif = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(all_words_negatif)

plt.imshow(wordcloud_negatif, interpolation="bilinear")
plt.title('Word Cloud dari Sentimen Negatif')
plt.show()

keyboard_arrow_down Distribusi Target

# Reviews per sentiment label, largest class first.
temp = (
    df.groupby('sentimen')['content']
    .count()
    .reset_index()
    .sort_values(by='content', ascending=False)
)
temp.style.background_gradient(cmap='inferno_r')

sentimen content

1 Netral 3987

2 Positif 2541

0 Negatif 1562

# Bar chart of how many reviews carry each sentiment label.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='sentimen')

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 6/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory

<Axes: xlabel='sentimen', ylabel='count'>

# Funnel-area chart of the per-label counts held in `temp`.
funnel_trace = go.Funnelarea(
    text=temp['sentimen'],
    values=temp['content'],
    title={"position": "top center", "text": "Funnel-Chart dari Distribusi target"},
)
fig = go.Figure(funnel_trace)
fig.show()

Funnel-Chart

!pip install palettable


from palettable.colorbrewer.qualitative import Pastel1_7

Collecting palettable
Downloading palettable-3.3.3-py2.py3-none-any.whl (332 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 332.3/332.3 kB 2.2 MB/s eta 0:00:00
Installing collected packages: palettable
Successfully installed palettable-3.3.3

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 7/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Word frequencies across all 'Netral' reviews, most frequent first.
unique_netral_words = (
    df_netral['content']
    .str.split(expand=True)
    .stack()
    .value_counts()
    .reset_index()
)
unique_netral_words.columns = ['words', 'count']
# Despite the variable name, only the 12 most frequent words are plotted.
top_20_words = unique_netral_words.iloc[:12]

plt.figure(figsize=(12, 6))
# The white disc drawn over the centre turns the pie into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
plt.pie(
    top_20_words['count'],
    labels=top_20_words['words'],
    colors=Pastel1_7.hex_colors,
)
plt.gca().add_artist(my_circle)
plt.title('Donut Plot dari Sentimen Netral')
plt.show()

# Word frequencies across all 'Positif' reviews, most frequent first.
unique_positif_words = (
    df_positif['content']
    .str.split(expand=True)
    .stack()
    .value_counts()
    .reset_index()
)
unique_positif_words.columns = ['words', 'count']
# Despite the variable name, only the 12 most frequent words are plotted.
top_20_words = unique_positif_words.iloc[:12]

plt.figure(figsize=(12, 6))
# The white disc drawn over the centre turns the pie into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
plt.pie(
    top_20_words['count'],
    labels=top_20_words['words'],
    colors=Pastel1_7.hex_colors,
)
plt.gca().add_artist(my_circle)
plt.title('Donut Plot dari Sentimen positif')
plt.show()

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 8/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory
# Word frequencies across all 'Negatif' reviews, most frequent first.
unique_negatif_words = (
    df_negatif['content']
    .str.split(expand=True)
    .stack()
    .value_counts()
    .reset_index()
)
unique_negatif_words.columns = ['words', 'count']
# Despite the variable name, only the 12 most frequent words are plotted.
top_20_words = unique_negatif_words.iloc[:12]

plt.figure(figsize=(12, 6))
# The white disc drawn over the centre turns the pie into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
plt.pie(
    top_20_words['count'],
    labels=top_20_words['words'],
    colors=Pastel1_7.hex_colors,
)
plt.gca().add_artist(my_circle)
plt.title('Donut Plot dari Sentimen negatif')
plt.show()

keyboard_arrow_down SPLIT

# splitting
from sklearn.model_selection import train_test_split

# Split the review text first, then fit TF-IDF on the training portion only.
# Fitting the vectorizer on the whole corpus before splitting lets test-set
# vocabulary and document frequencies leak into the training features.
# test_size and random_state are unchanged, so the row partition is the same.
text_train, text_test, y_train, y_test = train_test_split(
    df['content'], df['sentimen'], test_size=0.2, random_state=42
)
# Refit the shared vectorizer so it matches the features the model is trained on.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

X_train.shape, X_test.shape

((6472, 6961), (1618, 6961))

keyboard_arrow_down Resampling target

from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Class distribution of the resampled training labels.
_, resampled_ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=y_train, ax=resampled_ax)
resampled_ax.set_title('Distribusi target untuk modeling')
plt.show()

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 9/10
4/2/24, 5:39 PM Analisis sentimen Gojek V4.8 | Random Forest - Colaboratory

keyboard_arrow_down Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values sampled by the random-forest search.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

keyboard_arrow_down Random Forest


# Randomised search: 10 sampled parameter sets, 5-fold CV, scored on accuracy.
rf_model = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
rf_model.fit(X_train, y_train)

▸ RandomizedSearchCV
▸ estimator: RandomForestClassifier

▸ RandomForestClassifier

keyboard_arrow_down Model Eval


# Report the parameter combination selected by the search.
best_rf_params = rf_model.best_params_
print("\nBest Parameters for Random Forest:", best_rf_params)

Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}

# evaluasi model
from sklearn.metrics import classification_report

# Predict the held-out split with the best estimator found by the search and
# print per-class precision / recall / F1.
best_rf = rf_model.best_estimator_
y_pred_rf = best_rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)

print("\n\nClassification Report for Random Forest (Tuned):")
print(rf_report)

Classification Report for Random Forest (Tuned):


precision recall f1-score support

Negatif 0.84 0.93 0.88 310


Netral 0.96 0.96 0.96 803
Positif 0.98 0.91 0.94 505

https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/analisis-sentimen-gojek-v4-8-rando… 10/10

You might also like