# Koding Text Mining (Text Mining Code)
import pandas as pd
import re
import string

# Load the raw dana-desa dataset.
data = pd.read_excel("dana_desa.xlsx")
data.head()

# NOTE(review): the original script called `datacleaning` and `CaseFolding`
# without defining them anywhere; the definitions below reconstruct the
# standard Indonesian text-preprocessing steps — confirm against the
# original notebook.
def datacleaning(text):
    """Strip URLs, mentions/hashtags, digits and punctuation from *text*."""
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)   # URLs
    text = re.sub(r'[@#]\w+', ' ', text)                 # mentions / hashtags
    text = re.sub(r'\d+', ' ', text)                     # digits
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()             # collapse whitespace


def CaseFolding(text):
    """Lower-case the whole document (case-folding step)."""
    return text.lower()


# data cleaning
# BUG FIX: chained `data['Text'].fillna(..., inplace=True)` is deprecated
# (pandas >= 2.1) and a no-op under copy-on-write; assign the result instead.
data['Text'] = data['Text'].fillna('test')
data.head()
data['Text'] = data['Text'].apply(datacleaning)
data

# case folding
data['Text'] = data['Text'].apply(CaseFolding)
data
# Normalisasi: map slang/abbreviated words to their normal form.
key_norm = pd.read_csv('key_norm.csv')

# Build the lookup table once. The original scanned the whole DataFrame for
# every word of every document (O(rows) per word) — needlessly slow.
# NOTE(review): with duplicate 'singkat' entries the dict keeps the LAST
# mapping, where the original took the first match — confirm the key_norm
# table has unique keys.
_norm_map = dict(zip(key_norm['singkat'], key_norm['hasil']))


def WordNormalization(text):
    """Replace each abbreviated word in *text* with its normalized form.

    Words absent from the key_norm table are kept unchanged; the result is
    lower-cased, matching the original behaviour.
    """
    normalized = ' '.join(_norm_map.get(word, word) for word in text.split())
    return normalized.lower()


data['Text'] = data['Text'].apply(WordNormalization)
data
# Tokenizing: split each document into a list of word tokens.
# FIX: `import nltk` was duplicated in the original; one import suffices.
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # tokenizer models required by word_tokenize


def Tokenizing(text):
    """Return the list of word tokens of *text* (NLTK word_tokenize)."""
    return word_tokenize(text)


data['Text'] = data['Text'].apply(Tokenizing)
data
# Stopword removal: drop common Indonesian words that carry little meaning.
import nltk
from nltk.corpus import stopwords

# BUG FIX: the NLTK resource is named 'stopwords' (plural); downloading
# 'stopword' fails and the corpus would be missing.
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))


def Filtering(text):
    """Remove stopwords from a token list and re-join it into one string."""
    clean_words = [word for word in text if word not in stop_words]
    return " ".join(clean_words)


data['Text'] = data['Text'].apply(Filtering)
# BUG FIX: the original line read `Data` (capital D) — a NameError.
data
# Stemming: reduce each Indonesian word to its root form with Sastrawi.
# BUG FIX: the original used `StemmerFactory` and `Stemming` without
# importing or defining them; both are supplied here.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def Stemming(text):
    """Return *text* with every word stemmed to its Indonesian root."""
    return stemmer.stem(text)


data['Text'] = data['Text'].apply(Stemming)
data
data.to_excel('danadesa_dc.xlsx', index=False)
# Reload the cleaned dataset for the lexicon-based sentiment-labelling stage.
import pandas as pd

data = pd.read_excel("danadesa_dc.xlsx")
data
# BUG FIX: chained `fillna(..., inplace=True)` on a column is deprecated
# (pandas >= 2.1) and a no-op under copy-on-write; assign the result instead.
data['Text'] = data['Text'].fillna('test')
data.head()
import csv


def _load_lexicon(path):
    """Read a two-column CSV (word, weight) into a {word: int(weight)} dict."""
    lexicon = {}
    # newline='' is the documented way to open files for the csv module.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            lexicon[row[0]] = int(row[1])
    return lexicon


# Positive / negative word weights of the Indonesian sentiment lexicon.
# (The original duplicated the loading loop verbatim; one helper suffices.)
lexicon_positive = _load_lexicon('lexicon_positive_ver1.csv')
lexicon_negative = _load_lexicon('lexicon_negative_ver1.csv')
def sentiment_analysis_lexicon_indonesia(text, positive=None, negative=None):
    """Score a token list against positive/negative sentiment lexicons.

    Parameters
    ----------
    text : iterable of str
        The tokenized document.
    positive, negative : dict, optional
        word -> integer-weight lexicons. Default to the module-level
        ``lexicon_positive`` / ``lexicon_negative`` (backward compatible
        with the original zero-argument call sites).

    Returns
    -------
    (int, str)
        Total score and its label: 'positif' if > 0, 'negatif' if < 0,
        otherwise 'netral'.
    """
    if positive is None:
        positive = lexicon_positive
    if negative is None:
        negative = lexicon_negative
    # One pass is enough: each word contributes its positive and/or negative
    # weight exactly as in the original two-loop version.
    score = 0
    for word in text:
        score += positive.get(word, 0)
        score += negative.get(word, 0)
    if score > 0:
        sentimen = 'positif'
    elif score < 0:
        sentimen = 'negatif'
    else:
        sentimen = 'netral'
    return score, sentimen
# Tokenize by whitespace, then label every document with the lexicon scorer.
data['Text'] = data['Text'].str.split()
data

results = data['Text'].apply(sentiment_analysis_lexicon_indonesia)
scores, labels = zip(*results)
data['score'] = scores
data['Sentimen'] = labels
print(data['Sentimen'].value_counts())
data
def convert_tokens_to_text(tokens):
    """Join a token list back into a single space-separated string."""
    return ' '.join(tokens)
# Persist the lexicon-labelled dataset for the modelling stages.
data['Text'] = data['Text'].apply(convert_tokens_to_text)
print(data)
# BUG FIX: a stray `plt.show()` stood here although matplotlib was never
# imported and no figure existed — it raised a NameError; removed.
data.to_excel('danadesa_lexicon.xlsx', index=False)
# Pemodelan Menggunakan Metode C4.5 (Modelling with the C4.5 method)
# C4.5 (entropy-based decision tree) classification of the labelled data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# NOTE(review): the original transcript used X, y and y_pred without ever
# defining them and trained no classifier at all; the feature/label
# extraction, split and model below reconstruct the obvious pipeline —
# confirm against the original notebook.
X = data['Text'].astype(str)
y = data['Sentimen']

tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_tf_idf, y, test_size=0.2, random_state=100)

# C4.5 is approximated in scikit-learn by a decision tree using entropy.
c45 = DecisionTreeClassifier(criterion='entropy', random_state=100)
c45.fit(X_train, y_train)
y_pred = c45.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()
# Pemodelan SVM (SVM modelling)
# SVM classification of the labelled data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# NOTE(review): X, y and the train/test split were undefined in the original
# transcript; reconstructed below — confirm against the original notebook.
X = data['Text'].astype(str)
y = data['Sentimen']

tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_tf_idf, y, test_size=0.2, random_state=100)

# BUG FIX: the original wrote `svm = svm.SVC(...)`, which both required and
# shadowed a never-imported `svm` module; import SVC directly and use a
# non-shadowing variable name instead.
svm_model = SVC(class_weight=None, C=1, gamma=0.1, kernel='linear',
                random_state=100, probability=True)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

cm = confusion_matrix(y_test, svm_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()
# Pemodelan KNN (KNN modelling)
# KNN classification: scan k = 1..19 for the lowest test error, then
# evaluate the best model on the test set.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# FIX(review): the original read "daring_lexicon.xlsx" here although every
# other stage of this script produces/consumes "danadesa_lexicon.xlsx" —
# presumably a copy-paste slip; confirm which dataset was intended.
data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# NOTE(review): X, y and the split were undefined in the original
# transcript; reconstructed below.
X = data['Text'].astype(str)
y = data['Sentimen']

tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_tf_idf, y, test_size=0.2, random_state=100)

errors = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    errors.append(1 - knn.score(X_test, y_test))

# BUG FIX: y_pred was never computed in the original; refit with the k
# that minimized the error above and predict on the test set.
best_k = int(np.argmin(errors)) + 1
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()
# Random Forest classification of the labelled data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# FIX(review): "daring_lexicon.xlsx" in the original is inconsistent with
# the "danadesa_lexicon.xlsx" file produced earlier in this script —
# presumably a copy-paste slip; confirm which dataset was intended.
data = pd.read_excel("danadesa_lexicon.xlsx")
data.head()

# NOTE(review): X, y and the train/test split were undefined in the
# original transcript; reconstructed below.
X = data['Text'].astype(str)
y = data['Sentimen']

tf_idf = TfidfVectorizer()
tf_idf.fit(X)
X_tf_idf = tf_idf.transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_tf_idf, y, test_size=0.2, random_state=100)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
label_names = np.unique(y)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()