# Imports assumed by the rest of this notebook (preprocessing, plotting, TextBlob labelling,
# and the IndoBERT fine-tuning / Optuna sections below).
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
import optuna

# 'data' (the raw reviews DataFrame with a 'text' column) is assumed to be loaded earlier
text = pd.DataFrame({'review_text': data['text']})
text.head()
train, test = train_test_split(text,test_size=0.20, random_state=50)
# Check for null values (missing values) in the dataset
print("Number of missing values in the train set:")
print(train.isnull().sum())
print("\nNumber of missing values in the test set:")
print(test.isnull().sum())
def create_corpus(data, var):
    corpus = []
    for x in data[var].str.split():
        for i in x:
            corpus.append(i)
    return corpus
def remove_indonesian_stop(data, feature):
    # Drop the Indonesian stopwords in fix_stopwords from every document in the column
    stopwords = fix_stopwords
    filtered = []
    for text in data[feature]:
        text_list = [word for word in text.split() if word not in stopwords]
        filtered.append(' '.join(text_list))
    return filtered
def num_count(data, feature):
    # Count the digit characters in each document
    #kompas_list = list(set(kompas['words']))
    num_list = []
    for sentence in tqdm(data[feature]):
        num = 0
        for char in sentence:
            if char.isdigit():
                num += 1
        num_list.append(num)
    return num_list
def plot_unique_word_count(corpus, width, height, range1, range2, title, color, ax=None):
    words = []
    values = []
    len_words = []
    for word, value in zip(pd.DataFrame(corpus).value_counts().index,
                           pd.DataFrame(corpus).value_counts()):
        words.append(word[0])
        values.append(value)
        len_words.append(len(word[0]))
    res = pd.DataFrame({'words': words,
                        'values': values,
                        'len_words': len_words})
    res = res.sort_values(by='values', ascending=False)
    #plt.figure(figsize = [width, height])
    ax.set_title(title)
    sns.barplot(data=res[range1:range2],
                y='words',
                x='values',
                color=color,
                ax=ax)
    for index, value in enumerate(res['values'].iloc[range1:range2]):
        plt.text(value, index, value)
    #plt.show()
    return ax
stopwords = ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti',
'jika', 'jika', 'sehingga', 'kembali', 'dan', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'setelah', 'kami',
'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa',
'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara',
'agar', 'lain', 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka',
'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun',
'sampai', 'sedangkan', 'selagi', 'sementara', 'apakah', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya',
'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'ingin', 'juga', 'nggak',
'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', 'setiap', 'setidaknya', 'sesuatu', 'pasti',
'saja', 'toh', 'ya', 'walau', 'tolong', 'tentu', 'amat', 'apalagi', 'bagaimanapun', 'sekali', 'jadi', 'nya']
keep_stopwords = ['tidak', 'sementara', 'belum', 'tetapi', 'kecuali', 'tapi', 'ada', 'tanpa', 'nggak', 'ok',
'hanya', 'kurang']
# Keep the words in keep_stopwords (negations etc.) by excluding them from the stopword list
fix_stopwords = [word for word in stopwords if word not in keep_stopwords]
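# Illustrative only: a minimal sketch of how create_corpus and remove_indonesian_stop behave
# on two made-up sentences (the sample DataFrame below is not part of the dataset).
_sample = pd.DataFrame({'review_text_cleaned': ['kamar yang bersih dan nyaman',
                                                'pelayanan tidak ramah']})
# create_corpus flattens the column into tokens:
# ['kamar', 'yang', 'bersih', 'dan', 'nyaman', 'pelayanan', 'tidak', 'ramah']
print(create_corpus(_sample, 'review_text_cleaned'))
# remove_indonesian_stop drops words in fix_stopwords ('yang', 'dan', ...) but keeps
# negations such as 'tidak': ['kamar bersih nyaman', 'pelayanan tidak ramah']
print(remove_indonesian_stop(_sample, 'review_text_cleaned'))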
words_dict = {
'tdk' : 'tidak',
'yg' : 'yang',
'ga' : 'tidak',
'gak' : 'tidak',
'tp' : 'tapi',
'd' : 'di',
'sy' : 'saya',
'&' : 'dan',
'dgn' : 'dengan',
'utk' : 'untuk',
'gk' : 'tidak',
'jd' : 'jadi',
'jg' : 'juga',
'dr' : 'dari',
'krn' : 'karena',
'aja' : 'saja',
'karna' : 'karena',
'udah' : 'sudah',
'kmr' : 'kamar',
'g' : 'tidak',
'dpt' : 'dapat',
'banget' : 'sekali',
'bgt' : 'sekali',
'kalo' : 'kalau',
'n' : 'dan',
'bs' : 'bisa',
'oke' : 'ok',
'dg' : 'dengan',
'pake' : 'pakai',
'sampe' : 'sampai',
'dapet' : 'dapat',
'ad' : 'ada',
'lg' : 'lagi',
'bikin' : 'buat',
'tak' : 'tidak',
'ny' : 'nya',
'ngga' : 'tidak',
'nunggu' : 'tunggu',
'klo' : 'kalau',
'blm' : 'belum',
'trus' : 'terus',
'kayak' : 'seperti',
'dlm' : 'dalam',
'udh' : 'sudah',
'tau' : 'tahu',
'org' : 'orang',
'hrs' : 'harus',
'msh' : 'masih',
'sm' : 'sama',
'byk' : 'banyak',
'krg' : 'kurang',
'kmar' : 'kamar',
'spt' : 'seperti',
'pdhl' : 'padahal',
'chek' : 'cek',
'pesen' : 'pesan',
'kran' : 'keran',
'gitu' : 'begitu',
'tpi' : 'tapi',
'lbh' : 'lebih',
'tmpt' : 'tempat',
'dikasi' : 'dikasih',
'serem' : 'seram',
'sya' : 'saya',
'jgn' : 'jangan',
'dri' : 'dari',
'dtg' : 'datang',
'gada' : 'tidak ada',
'standart' : 'standar',
'mlm' : 'malam',
'k' : 'ke',
'kl' : 'kalau',
'sgt': 'sangat',
'y' : 'ya',
'krna' : 'karena',
'tgl' : 'tanggal',
'terimakasih' : 'terima kasih',
'kecoak' : 'kecoa',
'pd' : 'pada',
'tdr' : 'tidur',
'jdi' : 'jadi',
'kyk' : 'seperti',
'sdh' : 'sudah',
'ama' : 'sama',
'gmana' : 'bagaimana',
'dalem' : 'dalam',
'tanyak' : 'tanya',
'taru' : 'taruh',
'gede' : 'besar',
'kaya' : 'seperti',
'access' : 'akses',
'tetep' : 'tetap',
'mgkin' : 'mungkin',
'sower' : 'shower',
'idup' : 'hidup',
'nyaaa' : 'nya',
'baikk' : 'baik',
'hanay' : 'hanya',
'tlp' : 'telpon',
'kluarga' : 'keluarga',
'jln' : 'jalan',
'hr' : 'hari',
'ngak' : 'tidak',
'bli' : 'beli',
'naro' : 'taruh'
}
train['review_text_cleaned'] = [i.lower() for i in train['review_text']]
test['review_text_cleaned'] = [i.lower() for i in test['review_text']]
# removing non-ASCII characters
train['review_text_cleaned'] = [re.sub(r'[^\x00-\x7f]',r'', i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r'[^\x00-\x7f]',r'', i) for i in test['review_text_cleaned']]
# removing newlines
train['review_text_cleaned'] = [re.sub(r'\n', r' ', i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r'\n', r' ', i) for i in test['review_text_cleaned']]
# removing numbers
train['review_text_cleaned'] = [re.sub(r"\d+", r"", i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r"\d+", r"", i) for i in test['review_text_cleaned']]
# replacing punctuation with spaces
train['review_text_cleaned'] = [i.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [i.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
for i in test['review_text_cleaned']]
train['num_of_space'] = [i.count(' ') for i in train['review_text_cleaned']]
test['num_of_space'] = [i.count(' ') for i in test['review_text_cleaned']]
train['num_of_words'] = [len(i.split()) for i in train['review_text_cleaned']]
test['num_of_words'] = [len(i.split()) for i in test['review_text_cleaned']]
train['num_unique_char'] = [len(set(i)) for i in train['review_text_cleaned']]
test['num_unique_char'] = [len(set(i)) for i in test['review_text_cleaned']]
train['len_string_initial'] = [len(i) for i in train['review_text']]
test['len_string_initial'] = [len(i) for i in test['review_text']]
train['num_of_numeric'] = num_count(train, 'review_text')
test['num_of_numeric'] = num_count(test, 'review_text')
# Replace slang/abbreviations with their standard forms using words_dict
list_sentence_train = []
for sentence in tqdm(train['review_text_cleaned']):
    cleaned_sentence = [words_dict.get(word, word) for word in sentence.split()]
    list_sentence_train.append(' '.join(cleaned_sentence))
train['review_text_cleaned'] = list_sentence_train

list_sentence_test = []
for sentence in tqdm(test['review_text_cleaned']):
    cleaned_sentence = [words_dict.get(word, word) for word in sentence.split()]
    list_sentence_test.append(' '.join(cleaned_sentence))
test['review_text_cleaned'] = list_sentence_test
# removing Indonesian stopwords
train['review_text_cleaned_nostopwords'] = remove_indonesian_stop(train, 'review_text_cleaned')
test['review_text_cleaned_nostopwords'] = remove_indonesian_stop(test, 'review_text_cleaned')
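# Illustrative only: an end-to-end cleaning sketch for one made-up review, mirroring the steps
# applied above (lowercase, strip non-ASCII/newlines/digits/punctuation, normalise slang via
# words_dict, then drop the stopwords in fix_stopwords).
_raw = "Kamarnya bagus bgt, tp AC-nya tdk dingin!!"
_clean = _raw.lower()
_clean = re.sub(r'[^\x00-\x7f]', r'', _clean)
_clean = re.sub(r'\n', r' ', _clean)
_clean = re.sub(r'\d+', r'', _clean)
_clean = _clean.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
_clean = ' '.join(words_dict.get(w, w) for w in _clean.split())
_clean = ' '.join(w for w in _clean.split() if w not in fix_stopwords)
print(_clean)  # -> "kamarnya bagus tapi ac tidak dingin"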
# Convert the DataFrame into a list of dictionaries
tweet_list = train.to_dict('records')
hasil_analisis = []
for tweet in tweet_list:
    tweet_properties = {}
    tweet_bersih = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                                   " ", tweet["review_text"]).split())
    analysis = TextBlob(tweet_bersih)
    if analysis.sentiment.polarity > 0.0:
        tweet_properties["category"] = 'positive'
    elif analysis.sentiment.polarity < 0.0:
        tweet_properties["category"] = 'negative'
    else:
        tweet_properties["category"] = 'neutral'
    hasil_analisis.append(tweet_properties)
hasil_analisis_df = pd.DataFrame(hasil_analisis)
# Attach the 'category' column to train positionally (train keeps its original index after
# train_test_split, so a plain index-based concat would misalign the labels)
train['category'] = hasil_analisis_df['category'].values
# Convert the DataFrame into a list of dictionaries
tweet_list = test.to_dict('records')
hasil_analisis = []
for tweet in tweet_list:
    tweet_properties = {}
    tweet_bersih = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                                   " ", tweet["review_text"]).split())
    analysis = TextBlob(tweet_bersih)
    if analysis.sentiment.polarity > 0.0:
        tweet_properties["category"] = 'positive'
    elif analysis.sentiment.polarity < 0.0:
        tweet_properties["category"] = 'negative'
    else:
        tweet_properties["category"] = 'neutral'
    hasil_analisis.append(tweet_properties)
hasil_analisis_df = pd.DataFrame(hasil_analisis)
# Attach the 'category' column to test positionally (same reasoning as for train above)
test['category'] = hasil_analisis_df['category'].values
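# Quick sanity check of the TextBlob-derived label distribution (illustrative; TextBlob's
# sentiment analyzer is English-oriented, so many Indonesian reviews may end up 'neutral').
print(train['category'].value_counts(normalize=True))
print(test['category'].value_counts(normalize=True))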
# Sample size and data
sample_size = 100
sample_train = train.sample(n=sample_size, random_state=42)
# Plot configuration
plt.figure(figsize=(10, 5), dpi=90)  # use a tuple for figsize
sns.set_style('whitegrid')
# Build the countplot; fixing the category order keeps the tick labels below correct
ax = sns.countplot(
    x='category',
    data=sample_train,
    order=['negative', 'neutral', 'positive'],
    palette=['#F42E56', '#808080', '#31B057']
)
# Relabel the x axis
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
# Compute the percentage for each bar and add it as a label
total = len(sample_train)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2  # x position at the centre of the bar
    y = p.get_height()                 # y position at the top of the bar
    ax.annotate(percentage, (x, y), ha='center', va='bottom')
# Add the title
plt.title('Class Distribution Plot (Sample)', fontsize=14)
# Show the plot
plt.show()
# Distribution comparison per sentiment class for three text-length features
dist_features = [('len_string_initial', 'Distplot Comparison Based on Length of Characters'),
                 ('num_of_words', 'Distplot Comparison Based on Number of Words'),
                 ('num_unique_char', 'Distplot Comparison Based on Unique Characters')]
for feature, title in dist_features:
    plt.figure(figsize=[15, 5], dpi=90)
    for category, color in [('negative', '#F42E56'), ('neutral', '#808080'), ('positive', '#31B057')]:
        values = train[train['category'] == category][feature]
        sns.distplot(values, color=color,
                     label=f'{category}; mean = {values.mean():.0f}')
    plt.title(title)
    plt.legend()
# Remove rows containing NaN values
train = train.dropna()
fig = plt.figure(figsize = [15, 15])
ax1 = fig.add_subplot(1,2,1)
#class_distribution("colname1", ax=ax1)
plot_unique_word_count(create_corpus(train[train['category'] == 'negative'],"review_text_cleaned"),
15,
18,
0,
50,
'Negative Sentiment Unigram (with all stopwords) \n',
'#F42E56', ax = ax1)
ax2 = fig.add_subplot(1,2,2)
plot_unique_word_count(create_corpus(train[train['category'] == 'negative'],
'review_text_cleaned_nostopwords'),
5,
5,
0,
50,
'Negative Sentiment Unigram (keeping some stopwords) \n',
'#F42E56', ax = ax2)
fig = plt.figure(figsize = [15, 15])
ax1 = fig.add_subplot(1,2,1)
#class_distribution("colname1", ax=ax1)
plot_unique_word_count(create_corpus(train[train['category'] == 'positive'], 'review_text_cleaned'),
15,
18,
0,
50,
'Positive Sentiment Unigram (with all stopwords) \n',
'#31B057', ax = ax1)
ax2 = fig.add_subplot(1,2,2)
plot_unique_word_count(create_corpus(train[train['category'] == 'positive'],
'review_text_cleaned_nostopwords'),
5,
5,
0,
50,
'Positive Sentiment Unigram (keeping some stopwords) \n',
'#31B057', ax = ax2)
from sklearn.feature_extraction.text import CountVectorizer
def get_top_tweet_bigrams(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
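# Illustrative only: what get_top_tweet_bigrams returns on a tiny made-up corpus, i.e.
# (bigram, count) pairs sorted by frequency, e.g. [('kamar bersih', 2), ('bersih nyaman', 1), ...]
_toy = ['kamar bersih nyaman', 'kamar bersih sekali', 'pelayanan ramah sekali']
print(get_top_tweet_bigrams(_toy, n=3))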
fig = plt.figure(figsize=(18,18))
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'negative']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y,y=x, color = '#F42E56')
plt.title('Negative Sentiment Bigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'positive']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y,y=x, color = '#31B057')
plt.title('Positive Sentiment Bigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
from sklearn.feature_extraction.text import CountVectorizer
# The same helper redefined for trigrams (ngram_range=(3, 3)); the bigram name is kept from the cell above
def get_top_tweet_bigrams(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
fig = plt.figure(figsize=(18,18))
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'negative']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y,y=x, color = '#F42E56')
plt.title('Negative Sentiment Trigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'positive']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y,y=x, color = '#31B057')
plt.title('Positive Sentiment Trigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
from sklearn.feature_extraction.text import CountVectorizer
# The same helper redefined for 4-grams (ngram_range=(4, 4)); the bigram name is kept from the cell above
def get_top_tweet_bigrams(corpus, n=None):
    vec = CountVectorizer(ngram_range=(4, 4)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
fig = plt.figure(figsize=(18,18))
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'negative']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y,y=x, color = '#F42E56')
plt.title('Negative Sentiment 4-grams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'positive']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y,y=x, color = '#31B057')
plt.title('Positive Sentiment 4-grams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
#####
# Document Sentiment
#####
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "label": torch.tensor(label, dtype=torch.long),
        }
class DocumentSentimentDataset(Dataset):
    # Static constant variables
    LABEL2INDEX = {'positive': 1, 'negative': 0, 'neutral': 2}  # Added 'neutral': 2
    INDEX2LABEL = {1: 'positive', 0: 'negative', 2: 'neutral'}  # Added 2: 'neutral'
    NUM_LABELS = 3  # Updated to 3

    def load_dataset(self, path):
        df = pd.read_csv(path)
        return df

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def __getitem__(self, index):
        data = self.data.loc[index, :]
        text, sentiment = data['review_text'], data['category']
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        # Keep the sentiment as a plain string; the collate function maps it via LABEL2INDEX
        sentiment = str(sentiment)
        return np.array(subwords), sentiment, data['review_text']

    def __len__(self):
        return len(self.data)
class DocumentSentimentDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len

    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)

        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)

        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i, :len(subwords)] = subwords
            mask_batch[i, :len(subwords)] = 1
            # Convert the sentiment label to its numerical index using LABEL2INDEX (-1 for unknown labels)
            sentiment_batch[i, 0] = DocumentSentimentDataset.LABEL2INDEX.get(sentiment, -1)
            seq_list.append(raw_seq)
        return subword_batch, mask_batch, sentiment_batch, seq_list
# Forward function for sequence classification
def forward_sequence_classification(model, batch_data):
    # Unpack batch data
    batch_input_ids, batch_attention_mask, batch_labels = batch_data
    # Forward pass
    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask,
                    labels=batch_labels)
    loss = outputs.loss
    logits = outputs.logits
    # Convert predictions and labels to lists
    batch_hyp = torch.argmax(logits, dim=1).tolist()
    batch_label = batch_labels.tolist()
    return loss, batch_hyp, batch_label
def document_sentiment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics
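# Illustrative only: document_sentiment_metrics_fn takes (predictions, true labels) as
# integer-encoded lists and returns accuracy plus macro-averaged F1/recall/precision.
print(document_sentiment_metrics_fn([1, 0, 0, 1], [1, 0, 1, 1]))
# -> {'ACC': 0.75, 'F1': ..., 'REC': ..., 'PRE': ...}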
X_train, X_val, y_train, y_val, indices_train, indices_val = train_test_split(train["review_text"],
train['category'], train.index, stratify = train["category"],
test_size=0.25, random_state=50)
pd.concat([X_train,y_train],axis=1).reset_index(drop=True).to_csv("train_1.csv",index=False)
pd.concat([X_val,y_val],axis=1).reset_index(drop=True).to_csv("validation_1.csv",index=False)
pd.concat([test[["review_text"]],test[['category']]],
axis=1).reset_index(drop=True).to_csv("test.csv",index=False)
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS
# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1',
config=config)
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)
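# Illustrative only (not used by the manual loops below): how the DocumentSentimentDataset /
# DocumentSentimentDataLoader pair defined above could consume train_1.csv with this tokenizer.
_train_ds = DocumentSentimentDataset('train_1.csv', tokenizer)
_train_dl = DocumentSentimentDataLoader(dataset=_train_ds, max_seq_len=512,
                                        batch_size=32, shuffle=True)
subword_batch, mask_batch, sentiment_batch, seq_list = next(iter(_train_dl))
print(subword_batch.shape, mask_batch.shape, sentiment_batch.shape)  # numpy batches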
texts = X_train.tolist()
sentimen_analysis_labels = []
# Perform sentiment analysis
for text in texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    sentimen_analysis_labels.append(label)
def objective(trial):
    # Define the hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
    # Create the optimizer with the suggested hyperparameters
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Training loop
    model.train()
    train_subset = list(zip(X_train, y_train))
    for epoch in range(5):  # Adjust number of epochs
        for text, label in train_subset:
            subwords = tokenizer.encode(text)
            subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
            labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
            optimizer.zero_grad()
            outputs = model(subwords, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation metric: accuracy on the validation split
    val_subset = list(zip(X_val, y_val))
    correct_predictions = 0
    total_predictions = 0
    model.eval()
    for text, label in val_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        logits = model(subwords)[0]
        predicted_label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
        if i2w[predicted_label] == label:
            correct_predictions += 1
        total_predictions += 1
    accuracy = correct_predictions / total_predictions
    return accuracy
# Create an Optuna study
study = optuna.create_study(direction="maximize")
# Run the optimization
study.optimize(objective, n_trials=3)
# Print the best trial
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)
from optuna.visualization import plot_parallel_coordinate
plot_parallel_coordinate(study).show()
from optuna.visualization import plot_optimization_history
plot_optimization_history(study).show()
optimizer = torch.optim.AdamW(model.parameters(), lr=study.best_params['learning_rate'],
weight_decay=study.best_params['weight_decay'])
# Store losses for plotting
train_losses = []
val_losses = []
# Training loop with per-epoch validation
train_subset = list(zip(X_train, y_train))
val_subset = list(zip(X_val, y_val))
for epoch in range(5):  # Adjust number of epochs
    model.train()
    epoch_train_loss = 0.0
    for text, label in train_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
        optimizer.zero_grad()
        outputs = model(subwords, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()
    # Average training loss for the epoch
    avg_train_loss = epoch_train_loss / len(train_subset)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

    # Validation loss for the epoch
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for text, label in val_subset:
            subwords = tokenizer.encode(text)
            subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
            labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
            outputs = model(subwords, labels=labels)
            val_loss += outputs.loss.item()
    # Average validation loss for the epoch
    avg_val_loss = val_loss / len(val_subset)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")
# Plot train and validation losses
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Train and Validation Loss")
plt.legend()
plt.grid(True)
plt.show()
# Evaluate on train data
texts = X_train.tolist()
sentimen_analysis_tuned_labels = []
# Perform sentiment analysis
for text in texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    sentimen_analysis_tuned_labels.append(label)
# Evaluate on test data
test = test.dropna()
test_texts = test['review_text'].tolist()
test_sentimen_analysis_labels = []
# Perform sentiment analysis
for text in test_texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    test_sentimen_analysis_labels.append(label)
pred = pd.DataFrame({'label':test_sentimen_analysis_labels}).reset_index()
pred.to_csv('pred.csv', index=False)
# Create a DataFrame from the collected labels
train_result_df = pd.DataFrame({
'label': y_train.tolist(),
'sentimen_analysis': sentimen_analysis_labels,
'sentimen_analysis_tuned': sentimen_analysis_tuned_labels
})
# Concatenate the labels DataFrame with the original review texts (reset the index so the
# positional predictions line up with X_train's rows)
X_train_df = X_train.reset_index(drop=True).to_frame()
train_results_df = pd.concat([X_train_df, train_result_df], axis=1)
print(train_results_df.head())
train_results_df.to_csv('train_results.csv', index=False)
# Map category values to numeric labels
label_mapping = {'positive': 1, 'neutral': 2, 'negative': 0}
# Convert the label column to a list of numeric labels with map
train_results_df = train_results_df.dropna()
true_label = train_results_df['label'].map(label_mapping).tolist()
# Calculate metrics for the non-optimized model
metrics_no_optimization = document_sentiment_metrics_fn(train_results_df['sentimen_analysis'],
true_label)
print("Metrics (No Optimization):", metrics_no_optimization)
# Calculate metrics for the Bayesian-optimized model
metrics_bayesian = document_sentiment_metrics_fn(train_results_df['sentimen_analysis_tuned'],
true_label)
print("Metrics (Bayesian Optimization):", metrics_bayesian)
# Create a DataFrame from the collected labels
test_results_df = pd.DataFrame({
'review_text': test['review_text'].tolist(),
'label': test['category'].tolist(),
'sentimen_analysis': pred['label']
})
#test_results_df = test_results_df.dropna()
print(test_results_df.head())
# Map category values to numeric labels
label_mapping = {'positive': 1, 'neutral': 2, 'negative': 0}
# Convert the label column to a list of numeric labels with map
true_test_label = test_results_df['label'].map(label_mapping).tolist()
# Calculate metrics for the Bayesian-optimized model
test_metrics_bayesian = document_sentiment_metrics_fn(test_results_df['sentimen_analysis'],
true_test_label)
print("Metrics (Bayesian Optimization) on test data:", test_metrics_bayesian)
Explain the steps I should include in the research methodology, and describe the stages of the "results and discussion" section based on this code.