Text Mining Code

The document outlines a comprehensive data preprocessing and sentiment analysis workflow using Python, including data cleaning, normalization, tokenization, stopword removal, stemming, and sentiment labeling using a lexicon-based approach. It also describes various modeling techniques such as Decision Trees (C4.5), Support Vector Machines (SVM), K-Nearest Neighbors (KNN), Random Forest, and Naive Bayes for sentiment classification, along with performance evaluation metrics. The final results are visualized using confusion matrices and accuracy scores for each model.

PREPROCESSING PROCESS

import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

data = pd.read_excel("dana_desa.xlsx")
data.head()

# data cleaning

# drop the columns that will not be used
data = data.drop(columns=['Datetime','Tweet_Id','Username','label','kata_kunci'])
data

# fill missing text with a placeholder so later string operations do not fail
data['Text'] = data['Text'].fillna('test')
data.head()

import re  # import the regular expressions module

# build a function for data cleaning
def datacleaning(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # remove hashtags
    text = re.sub(r'RT[\s]', '', text)         # remove retweet markers
    text = re.sub(r'http\S+', '', text)        # remove links (before the symbol step, while ':' and '/' are still intact)
    text = re.sub(r'[?|$|.|@#%^/&*=!_:")(-+,]', '', text)  # remove symbols
    text = re.sub(r'[0-9]+', '', text)         # remove digits
    text = text.replace('\n', ' ')             # replace newlines with spaces
    text = text.strip(' ')                     # trim spaces from both ends
    return text

data['Text'] = data['Text'].apply(datacleaning)
data
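
As a quick sanity check (an addition, not in the original notebook), the function can be exercised on a made-up tweet:

# hypothetical example text, not from the dataset
sample = "RT @user123: Dana desa naik 10% tahun ini! Info: http://example.com #danadesa"
print(datacleaning(sample))
# expected output, roughly: "Dana desa naik tahun ini Info" (extra inner spaces remain)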

# case folding

def CaseFolding(text):  # convert all words to lower case
    text = text.lower()
    return text

data['Text'] = data['Text'].apply(CaseFolding)
data
# Normalization

key_norm = pd.read_csv('key_norm.csv')

def WordNormalization(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
                     if (key_norm['singkat'] == word).any() else word
                     for word in text.split()])
    text = str.lower(text)
    return text

data['Text'] = data['Text'].apply(WordNormalization)
data
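
The lookup above rescans the whole key_norm table for every word, which gets slow on a larger corpus. A minimal behavior-equivalent sketch (an addition, assuming the same 'singkat'/'hasil' columns) builds the mapping once:

# build the abbreviation -> expansion mapping once instead of scanning per word
norm_map = dict(zip(key_norm['singkat'], key_norm['hasil']))

def WordNormalizationFast(text):
    # replace each word with its expansion if one exists, then lower-case
    return ' '.join(norm_map.get(word, word) for word in text.split()).lower()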

# Tokenizing

import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

def Tokenizing(text):
    text = word_tokenize(text)
    return text

data['Text'] = data['Text'].apply(Tokenizing)
data

# Stopword removal

import nltk
nltk.download('stopwords')  # the corpus name is 'stopwords', not 'stopword'

from nltk.corpus import stopwords

stop_words = set(stopwords.words('indonesian'))

def Filtering(text):
    clean_words = []
    for word in text:
        if word not in stop_words:
            clean_words.append(word)
    return " ".join(clean_words)

data['Text'] = data['Text'].apply(Filtering)
data
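
A quick check on a made-up token list ('yang' is in NLTK's Indonesian stopword list, so it should be dropped):

# hypothetical token list, as produced by the tokenizing step above
print(Filtering(['dana', 'desa', 'yang', 'naik']))
# expected: "dana desa naik"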
# Stemming

!pip install Sastrawi
!pip install --upgrade pip setuptools

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# build a function for Indonesian stemming
def Stemming(text):
    text = stemmer.stem(text)
    return text

data['Text'] = data['Text'].apply(Stemming)
data
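
A one-off check of the stemmer on a made-up phrase (the exact result depends on Sastrawi's dictionary):

# Sastrawi strips Indonesian affixes, e.g. the pe-...-an confix
print(stemmer.stem('pembangunan desa'))  # expected, roughly: "bangun desa"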

data.to_excel('danadesa_dc.xlsx', index=False)

LABELING PROCESS (LEXICON-BASED)

import pandas as pd

data = pd.read_excel("danadesa_dc.xlsx")
data

data['Text'] = data['Text'].fillna('test')
data.head()

import csv

lexicon_positive = dict()
with open('lexicon_positive_ver1.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        lexicon_positive[row[0]] = int(row[1])

lexicon_negative = dict()
with open('lexicon_negative_ver1.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        lexicon_negative[row[0]] = int(row[1])

def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    for word_pos in text:
        if (word_pos in lexicon_positive):
            score = score + lexicon_positive[word_pos]
    for word_neg in text:
        if (word_neg in lexicon_negative):
            score = score + lexicon_negative[word_neg]
    Sentimen = ''
    if (score > 0):
        Sentimen = 'positif'
    elif (score < 0):
        Sentimen = 'negatif'
    else:
        Sentimen = 'netral'
    return score, Sentimen
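
A quick check on a made-up token list (an addition; the result depends on the lexicon CSVs, and negative words carry negative integer weights in this scheme, which is why simple addition pulls the score down):

tokens = ['dana', 'desa', 'bantu', 'warga', 'korupsi']
print(sentiment_analysis_lexicon_indonesia(tokens))
# e.g. if 'bantu' scored +2 and 'korupsi' scored -4, this returns (-2, 'negatif')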

data['Text'] = data.Text.str.split()
data

results = data['Text'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
data['score'] = results[0]
data['Sentimen'] = results[1]
print(data['Sentimen'].value_counts())

data

def convert_tokens_to_text(tokens):
    text = ' '.join(tokens)
    return text

data['Text'] = data['Text'].apply(convert_tokens_to_text)

print(data)

# drop the column that is no longer needed
data = data.drop(columns=['score'])
data

import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots()
sns.histplot(data=data, x='Sentimen', color='skyblue')
plt.show()

data.to_excel('danadesa_lexicon.xlsx', index=False)
Modeling with the C4.5 Method

import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# separate the feature and target columns
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
tf_idf.fit(X)

X_tf_idf = tf_idf.transform(X).toarray()

data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf
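
As an optional sanity check (an addition, using only objects already defined above), the highest-weighted terms show what the vectorizer considers most distinctive:

# mean TF-IDF weight of each term across all documents, top 10
print(data_tf_idf.mean().sort_values(ascending=False).head(10))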

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn implements CART; criterion="entropy" approximates C4.5's information-gain splitting
dc_classifier = DecisionTreeClassifier(max_depth=3, criterion="entropy")
dc_classifier.fit(X_train, y_train)
y_pred = dc_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)

plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

print("Accuracy Score untuk C4.5 Model :: ", accuracy_score(y_test, y_pred))


print(classification_report(y_test, y_pred, zero_division=0))
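
For reference (an addition, not in the original), the per-class precision and recall printed by classification_report can be reproduced directly from the confusion matrix, since its rows are actual classes and its columns are predicted classes:

tp = np.diag(cm)                 # true positives per class
precision = tp / cm.sum(axis=0)  # column sums = all predictions of that class
recall = tp / cm.sum(axis=1)     # row sums = all actual members of that class
for name, p, r in zip(label_names, precision, recall):
    print(f"{name}: precision={p:.2f} recall={r:.2f}")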

SVM Modeling

import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# separate the feature and target columns
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
tf_idf.fit(X)

X_tf_idf = tf_idf.transform(X).toarray()

data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# use a distinct name so the model object does not shadow the svm module
svm_model = svm.SVC(class_weight=None, C=1, gamma=0.1, kernel='linear',
                    random_state=100, probability=True)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
cm = confusion_matrix(y_test, svm_pred)
label_names = np.unique(y)

plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

print("Accuracy Score untuk Support Vector Machine Model :: ",


accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred, zero_division=0))

KNN Modeling

import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

# load the same labeled file used by the other models
data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# separate the feature and target columns
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
tf_idf.fit(X)

X_tf_idf = tf_idf.transform(X).toarray()

data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)

from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns

# try k from 1 to 19 and record the test error for each
errors = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    errors.append(1 - knn.score(X_test, y_test))

# plot the elbow method

plt.plot(range(1, 20), errors, marker='o')
plt.xlabel('Number of neighbors (k)')
plt.ylabel('Error')
plt.title('Elbow Method')
plt.show()

# pick the best k from the error curve
# (this takes the k with the lowest test error, a pragmatic stand-in for reading the elbow by eye)

best_k = errors.index(min(errors)) + 1
print("Best value of k: ", best_k)

# train the KNN model with the best k

knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# predict sentiment on the test data

y_pred = knn.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)

plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

print(classification_report(y_test, y_pred, zero_division=0))
print("\nKNN :", accuracy_score(y_test, y_pred))
print('-------------------------------------------')

Random Forest Modeling

import pandas as pd

pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # show full column contents without truncation

# load the same labeled file used by the other models
data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# separate the feature and target columns
X = data['Text']
y = data['Sentimen']

from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
tf_idf.fit(X)

X_tf_idf = tf_idf.transform(X).toarray()

data_tf_idf = pd.DataFrame(X_tf_idf, columns=tf_idf.get_feature_names_out())
data_tf_idf

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tf_idf, y, test_size=0.3,
                                                    random_state=37)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)

plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

print("Accuracy Score untuk RandomForestClassifier Model :: ",


accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))
Naive Bayes Modeling
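
The source cuts off before this section's code. A minimal sketch following the same pattern as the models above, assuming MultinomialNB (a common choice for TF-IDF text features) and the same train/test split:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# train and evaluate on the same TF-IDF split used by the other models
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

print("Accuracy Score for the Naive Bayes Model ::", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))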
