Coding

The document outlines a data processing workflow for analyzing Indonesian text reviews using Python's pandas and related libraries. It includes steps for cleaning the text, normalising slang words, removing stopwords, counting words and characters, and labelling sentiment with TextBlob. It then visualizes the sentiment distribution and length comparisons across sentiment categories, and finally fine-tunes an IndoBERT sequence classifier with Optuna hyperparameter search and evaluates it on the train and test splits.
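The listing below never shows its import cell, and it assumes a DataFrame named data (with a 'text' column) has already been loaded before this excerpt. A minimal set of imports that would make the names used below resolve is sketched here; it is inferred from how the names are used, not copied from the author's notebook.

# Assumed, minimal imports inferred from usage in the listing below
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from textblob import TextBlob

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification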


text = pd.DataFrame({'review_text': data['text']})

text.head()

train, test = train_test_split(text,test_size=0.20, random_state=50)

# Check whether the dataset contains any null (missing) values

print("Number of missing values in the train dataset:")
print(train.isnull().sum())

print("\nNumber of missing values in the test dataset:")
print(test.isnull().sum())

def create_corpus(data, var):
    # Collect every token of the given column into one flat list
    corpus = []
    for x in data[var].str.split():
        for i in x:
            corpus.append(i)
    return corpus

def remove_indonesian_stop(data, feature):
    # Remove Indonesian stopwords (fix_stopwords) from every document in data[feature]
    stopwords = fix_stopwords
    filtered = []
    for text in data[feature]:
        text_list = []
        for word in text.split():
            if word not in stopwords:
                text_list.append(word)
        text_list = ' '.join(text_list)
        filtered.append(text_list)
    return filtered

def num_count(data, feature):
    # Count the digit characters in every document of data[feature]
    # kompas_list = list(set(kompas['words']))
    num_list = []
    for sentence in tqdm(data[feature]):
        num = 0
        for word in sentence:   # iterates character by character
            if word.isdigit():
                num += 1
        num_list.append(num)
    return num_list

def plot_unique_word_count(corpus, width, height, range1, range2, title, color, ax=None):
    # Count unique word frequencies in the corpus and plot the top words as a horizontal bar chart
    words = []
    values = []
    len_words = []
    for word, value in zip(pd.DataFrame(corpus).value_counts().index,
                           pd.DataFrame(corpus).value_counts()):
        words.append(word[0])
        values.append(value)
        len_words.append(len(word[0]))

    res = pd.DataFrame({'words': words,
                        'values': values,
                        'len_words': len_words})
    res = res.sort_values(by='values', ascending=False)

    # plt.figure(figsize=[width, height])
    ax = ax
    ax.set_title(title)
    sns.barplot(data=res[range1:range2],
                y='words',
                x='values',
                color=color,
                ax=ax)
    for index, value in enumerate(res['values'].iloc[range1:range2]):
        plt.text(value, index, value)
    # plt.show()
    return ax

stopwords = ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti',
'jika', 'jika', 'sehingga', 'kembali', 'dan', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'setelah', 'kami',
'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa',
'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara',
'agar', 'lain', 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka',
'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun',
'sampai', 'sedangkan', 'selagi', 'sementara', 'apakah', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya',
'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'ingin', 'juga', 'nggak',
'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', 'setiap', 'setidaknya', 'sesuatu', 'pasti',
'saja', 'toh', 'ya', 'walau', 'tolong', 'tentu', 'amat', 'apalagi', 'bagaimanapun', 'sekali', 'jadi', 'nya']

keep_stopwords = ['tidak', 'sementara', 'belum', 'tetapi', 'kecuali', 'tapi', 'ada', 'tanpa', 'nggak', 'ok',
'hanya', 'kurang']

# Keep the words listed in keep_stopwords out of the stopword list so they are not removed later
fix_stopwords = [word for word in stopwords if word not in keep_stopwords]
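A minimal sanity check of the stopword handling above (the sample sentence is hypothetical): generic stopwords such as 'yang' and 'sekali' are dropped, while words protected by keep_stopwords such as 'tidak' survive.

# Hypothetical one-row DataFrame to exercise remove_indonesian_stop
sample = pd.DataFrame({'review_text_cleaned': ['kamar yang tidak bersih sekali']})
print(remove_indonesian_stop(sample, 'review_text_cleaned'))
# expected: ['kamar tidak bersih']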

words_dict = {

'tdk' : 'tidak',

'yg' : 'yang',

'ga' : 'tidak',

'gak' : 'tidak',

'tp' : 'tapi',

'd' : 'di',

'sy' : 'saya',

'&' : 'dan',

'dgn' : 'dengan',

'utk' : 'untuk',

'gk' : 'tidak',

'jd' : 'jadi',

'jg' : 'juga',

'dr' : 'dari',

'krn' : 'karena',

'aja' : 'saja',

'karna' : 'karena',

'udah' : 'sudah',

'kmr' : 'kamar',
'g' : 'tidak',

'dpt' : 'dapat',

'banget' : 'sekali',

'bgt' : 'sekali',

'kalo' : 'kalau',

'n' : 'dan',

'bs' : 'bisa',

'oke' : 'ok',

'dg' : 'dengan',

'pake' : 'pakai',

'sampe' : 'sampai',

'dapet' : 'dapat',

'ad' : 'ada',

'lg' : 'lagi',

'bikin' : 'buat',

'tak' : 'tidak',

'ny' : 'nya',

'ngga' : 'tidak',

'nunggu' : 'tunggu',

'klo' : 'kalau',

'blm' : 'belum',

'trus' : 'terus',

'kayak' : 'seperti',

'dlm' : 'dalam',

'udh' : 'sudah',
'tau' : 'tahu',

'org' : 'orang',

'hrs' : 'harus',

'msh' : 'masih',

'sm' : 'sama',

'byk' : 'banyak',

'krg' : 'kurang',

'kmar' : 'kamar',

'spt' : 'seperti',

'pdhl' : 'padahal',

'chek' : 'cek',

'pesen' : 'pesan',

'kran' : 'keran',

'gitu' : 'begitu',

'tpi' : 'tapi',

'lbh' : 'lebih',

'tmpt' : 'tempat',

'dikasi' : 'dikasih',

'serem' : 'seram',

'sya' : 'saya',

'jgn' : 'jangan',

'dri' : 'dari',

'dtg' : 'datang',

'gada' : 'tidak ada',

'standart' : 'standar',
'mlm' : 'malam',

'k' : 'ke',

'kl' : 'kalau',

'sgt': 'sangat',

'y' : 'ya',

'krna' : 'karena',

'tgl' : 'tanggal',

'terimakasih' : 'terima kasih',

'kecoak' : 'kecoa',

'pd' : 'pada',

'tdr' : 'tidur',

'jdi' : 'jadi',

'kyk' : 'seperti',

'sdh' : 'sudah',

'ama' : 'sama',

'gmana' : 'bagaimana',

'dalem' : 'dalam',

'tanyak' : 'tanya',

'taru' : 'taruh',

'gede' : 'besar',

'kaya' : 'seperti',

'access' : 'akses',

'tetep' : 'tetap',

'mgkin' : 'mungkin',

'sower' : 'shower',
'idup' : 'hidup',

'nyaaa' : 'nya',

'baikk' : 'baik',

'hanay' : 'hanya',

'tlp' : 'telpon',

'kluarga' : 'keluarga',

'jln' : 'jalan',

'hr' : 'hari',

'ngak' : 'tidak',

'bli' : 'beli',

'kmar' : 'kamar',

'naro' : 'taruh'
}
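A tiny, hypothetical check of this normalisation map, looking up each token and falling back to the token itself, which is what the normalisation loops further below do.

# Hypothetical sample sentence built from keys of words_dict
sample_tokens = 'kmr tdk bersih bgt'.split()
print(' '.join(words_dict.get(tok, tok) for tok in sample_tokens))
# expected: 'kamar tidak bersih sekali'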

# Lowercase the raw review text
train['review_text_cleaned'] = [i.lower() for i in train['review_text']]
test['review_text_cleaned'] = [i.lower() for i in test['review_text']]

# Removing non-ASCII characters
train['review_text_cleaned'] = [re.sub(r'[^\x00-\x7f]', r'', i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r'[^\x00-\x7f]', r'', i) for i in test['review_text_cleaned']]

# Removing newline characters
train['review_text_cleaned'] = [re.sub(r'\n', r' ', i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r'\n', r' ', i) for i in test['review_text_cleaned']]

# Removing numbers
train['review_text_cleaned'] = [re.sub(r"\d+", r"", i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r"\d+", r"", i) for i in test['review_text_cleaned']]

# Removing punctuation (replace every punctuation character with a space)
train['review_text_cleaned'] = [i.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
                                for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [i.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
                               for i in test['review_text_cleaned']]


train['num_of_space'] = [i.count(' ') for i in train['review_text_cleaned']]

test['num_of_space'] = [i.count(' ') for i in test['review_text_cleaned']]


train['num_of_words'] = [len(i.split()) for i in train['review_text_cleaned']]

test['num_of_words'] = [len(i.split()) for i in test['review_text_cleaned']]

train['num_unique_char'] = [len(set(i)) for i in train['review_text_cleaned']]

test['num_unique_char'] = [len(set(i)) for i in test['review_text_cleaned']]

train['len_string_initial'] = [len(i) for i in train['review_text']]

test['len_string_initial'] = [len(i) for i in test['review_text']]

train['num_of_numeric'] = num_count(train, 'review_text')

test['num_of_numeric'] = num_count(test, 'review_text')

# Normalise slang and abbreviations using words_dict
list_sentence_train = []
for sentence in tqdm(train['review_text_cleaned']):
    cleaned_sentence = [words_dict[word] if word in words_dict else word
                        for word in sentence.split()]
    list_sentence_train.append(' '.join(cleaned_sentence))
train['review_text_cleaned'] = list_sentence_train

list_sentence_test = []
for sentence in tqdm(test['review_text_cleaned']):
    cleaned_sentence = [words_dict[word] if word in words_dict else word
                        for word in sentence.split()]
    list_sentence_test.append(' '.join(cleaned_sentence))
test['review_text_cleaned'] = list_sentence_test
# Removing Indonesian stopwords
train['review_text_cleaned_nostopwords'] = remove_indonesian_stop(train, 'review_text_cleaned')
test['review_text_cleaned_nostopwords'] = remove_indonesian_stop(test, 'review_text_cleaned')

# Convert the DataFrame into a list of dictionaries
tweet_list = train.to_dict('records')
hasil_analisis = []

for tweet in tweet_list:
    tweet_properties = {}
    tweet_bersih = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                                   tweet["review_text"]).split())
    analysis = TextBlob(tweet_bersih)
    if analysis.sentiment.polarity > 0.0:
        tweet_properties["category"] = 'positive'
    elif analysis.sentiment.polarity < 0.0:
        tweet_properties["category"] = 'negative'
    else:
        tweet_properties["category"] = 'neutral'
    hasil_analisis.append(tweet_properties)

hasil_analisis_df = pd.DataFrame(hasil_analisis)

# Join the 'category' column from hasil_analisis_df onto train by index
train = pd.concat([train, hasil_analisis_df['category']], axis=1)

# Convert the DataFrame into a list of dictionaries
tweet_list = test.to_dict('records')
hasil_analisis = []

for tweet in tweet_list:
    tweet_properties = {}
    tweet_bersih = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                                   tweet["review_text"]).split())
    analysis = TextBlob(tweet_bersih)
    if analysis.sentiment.polarity > 0.0:
        tweet_properties["category"] = 'positive'
    elif analysis.sentiment.polarity < 0.0:
        tweet_properties["category"] = 'negative'
    else:
        tweet_properties["category"] = 'neutral'
    hasil_analisis.append(tweet_properties)

hasil_analisis_df = pd.DataFrame(hasil_analisis)

# Join the 'category' column from hasil_analisis_df onto test by index
test = pd.concat([test, hasil_analisis_df['category']], axis=1)

# Sample size and sample data
sample_size = 100
sample_train = train.sample(n=sample_size, random_state=42)

# Plot configuration
plt.figure(figsize=(10, 5), dpi=90)  # use a tuple for figsize
sns.set_style('whitegrid')

# Create the countplot
ax = sns.countplot(
    x='category',
    data=sample_train,
    palette=['#F42E56', '#808080', '#31B057']
)

# Relabel the x-axis ticks
ax.set_xticklabels(['Positive', 'Neutral', 'Negative'])

# Compute percentages and add a label above each bar
total = len(sample_train)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2  # x position at the centre of the bar
    y = p.get_height()                 # y position at the top of the bar
    ax.annotate(percentage, (x, y), ha='center', va='bottom')

# Add a title
plt.title('Class Distribution Plot (Sample)', fontsize=14)

# Show the plot
plt.show()

plt.figure(figsize=[15, 5], dpi=90)
sns.distplot(train[train['category'] == 'negative']['len_string_initial'],
             color='#F42E56',
             label='negative; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'negative']['len_string_initial'].mean()))
sns.distplot(train[train['category'] == 'neutral']['len_string_initial'],
             color='#808080',
             label='neutral; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'neutral']['len_string_initial'].mean()))
sns.distplot(train[train['category'] == 'positive']['len_string_initial'],
             color='#31B057',
             label='positive; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'positive']['len_string_initial'].mean()))
plt.title('Distplot Comparison Based On Length of Characters')
plt.legend()

plt.figure(figsize=[15, 5], dpi=90)
sns.distplot(train[train['category'] == 'negative']['num_of_words'],
             color='#F42E56',
             label='negative; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'negative']['num_of_words'].mean()))
sns.distplot(train[train['category'] == 'neutral']['num_of_words'],
             color='#808080',
             label='neutral; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'neutral']['num_of_words'].mean()))
sns.distplot(train[train['category'] == 'positive']['num_of_words'],
             color='#31B057',
             label='positive; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'positive']['num_of_words'].mean()))
plt.title('Distplot Comparison Based on Number of Words')
plt.legend()

plt.figure(figsize=[15, 5], dpi=90)
sns.distplot(train[train['category'] == 'negative']['num_unique_char'],
             color='#F42E56',
             label='negative; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'negative']['num_unique_char'].mean()))
sns.distplot(train[train['category'] == 'neutral']['num_unique_char'],
             color='#808080',
             label='neutral; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'neutral']['num_unique_char'].mean()))
sns.distplot(train[train['category'] == 'positive']['num_unique_char'],
             color='#31B057',
             label='positive; mean = {mean}'.format(
                 mean='%.0f' % train[train['category'] == 'positive']['num_unique_char'].mean()))
plt.title('Distplot Comparison Based on Unique Characters')
plt.legend()

# Drop rows with NaN values in 'review_text_cleaned'
train = train.dropna()

fig = plt.figure(figsize=[15, 15])

ax1 = fig.add_subplot(1, 2, 1)
# class_distribution("colname1", ax=ax1)
plot_unique_word_count(create_corpus(train[train['category'] == 'negative'], 'review_text_cleaned'),
                       15, 18, 0, 50,
                       'Negative Sentiment Unigram (with all stopwords) \n',
                       '#F42E56', ax=ax1)

ax2 = fig.add_subplot(1, 2, 2)
plot_unique_word_count(create_corpus(train[train['category'] == 'negative'], 'review_text_cleaned_nostopwords'),
                       5, 5, 0, 50,
                       'Negative Sentiment Unigram (keeping some stopwords) \n',
                       '#F42E56', ax=ax2)

fig = plt.figure(figsize=[15, 15])

ax1 = fig.add_subplot(1, 2, 1)
# class_distribution("colname1", ax=ax1)
plot_unique_word_count(create_corpus(train[train['category'] == 'positive'], 'review_text_cleaned'),
                       15, 18, 0, 50,
                       'Positive Sentiment Unigram (with all stopwords) \n',
                       '#31B057', ax=ax1)

ax2 = fig.add_subplot(1, 2, 2)
plot_unique_word_count(create_corpus(train[train['category'] == 'positive'], 'review_text_cleaned_nostopwords'),
                       5, 5, 0, 50,
                       'Positive Sentiment Unigram (keeping some stopwords) \n',
                       '#31B057', ax=ax2)

from sklearn.feature_extraction.text import CountVectorizer

def get_top_tweet_bigrams(corpus, n=None):
    # Count bigram frequencies in the corpus and return the n most frequent ones
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

fig = plt.figure(figsize=(18, 18))

top_tweet_bigrams = get_top_tweet_bigrams(
    train[train['category'] == 'negative']['review_text_cleaned_nostopwords'])[:50]
x, y = map(list, zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y, y=x, color='#F42E56')
plt.title('Negative Sentiment Bigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1] + .5, index + .2, i[1])
    index += 1

top_tweet_bigrams = get_top_tweet_bigrams(
    train[train['category'] == 'positive']['review_text_cleaned_nostopwords'])[:50]
x, y = map(list, zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y, y=x, color='#31B057')
plt.title('Positive Sentiment Bigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1] + .5, index + .2, i[1])
    index += 1

from sklearn.feature_extraction.text import CountVectorizer

def get_top_tweet_bigrams(corpus, n=None):
    # Note: ngram_range=(3, 3) extracts trigrams, despite the function name
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

fig = plt.figure(figsize=(18, 18))

top_tweet_bigrams = get_top_tweet_bigrams(
    train[train['category'] == 'negative']['review_text_cleaned_nostopwords'])[:50]
x, y = map(list, zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y, y=x, color='#F42E56')
plt.title('Negative Sentiment Trigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1] + .5, index + .2, i[1])
    index += 1

top_tweet_bigrams = get_top_tweet_bigrams(
    train[train['category'] == 'positive']['review_text_cleaned_nostopwords'])[:50]
x, y = map(list, zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y, y=x, color='#31B057')
plt.title('Positive Sentiment Trigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1] + .5, index + .2, i[1])
    index += 1

from sklearn.feature_extraction.text import CountVectorizer

def get_top_tweet_bigrams(corpus, n=None):
    # Note: ngram_range=(4, 4) extracts 4-grams, despite the function name
    vec = CountVectorizer(ngram_range=(4, 4)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

fig = plt.figure(figsize=(18, 18))

top_tweet_bigrams = get_top_tweet_bigrams(
    train[train['category'] == 'negative']['review_text_cleaned_nostopwords'])[:50]
x, y = map(list, zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y, y=x, color='#F42E56')
plt.title('Negative Sentiment 4-grams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1] + .5, index + .2, i[1])
    index += 1

top_tweet_bigrams = get_top_tweet_bigrams(
    train[train['category'] == 'positive']['review_text_cleaned_nostopwords'])[:50]
x, y = map(list, zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y, y=x, color='#31B057')
plt.title('Positive Sentiment 4-grams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1] + .5, index + .2, i[1])
    index += 1

#####
# Document Sentiment
#####

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "label": torch.tensor(label, dtype=torch.long),
        }

class DocumentSentimentDataset(Dataset):
    # Static constant variables
    LABEL2INDEX = {'positive': 1, 'negative': 0, 'neutral': 2}  # added 'neutral': 2
    INDEX2LABEL = {1: 'positive', 0: 'negative', 2: 'neutral'}  # added 2: 'neutral'
    NUM_LABELS = 3  # updated to 3

    def load_dataset(self, path):
        df = pd.read_csv(path)
        return df

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def __getitem__(self, index):
        data = self.data.loc[index, :]
        text, sentiment = data['review_text'], data['category']
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        # Convert sentiment to string before returning
        sentiment = str(sentiment)  # assuming sentiment is a single value
        return np.array(subwords), np.array(sentiment), data['review_text']

    def __len__(self):
        return len(self.data)

class DocumentSentimentDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len

    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)

        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)

        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i, :len(subwords)] = subwords
            mask_batch[i, :len(subwords)] = 1
            # Convert the sentiment label to its numerical representation using LABEL2INDEX;
            # str() because __getitem__ wraps the label in a numpy array; -1 for unknown labels
            sentiment_batch[i, 0] = DocumentSentimentDataset.LABEL2INDEX.get(str(sentiment), -1)
            seq_list.append(raw_seq)

        return subword_batch, mask_batch, sentiment_batch, seq_list
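These two classes are defined but the rest of the listing runs inference one text at a time. A minimal sketch of how they could be wired together, assuming the IndoBERT tokenizer loaded further below and the train_1.csv file written further below, would look roughly like this (hypothetical usage, not part of the original notebook):

# Hypothetical usage sketch of the dataset / loader classes above
train_dataset = DocumentSentimentDataset('train_1.csv', tokenizer)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512,
                                           batch_size=32, shuffle=True)
subword_batch, mask_batch, sentiment_batch, seq_list = next(iter(train_loader))
print(subword_batch.shape, sentiment_batch[:5])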

# Forward function for sequence classification
def forward_sequence_classification(model, batch_data):
    # Unpack batch data
    batch_input_ids, batch_attention_mask, batch_labels = batch_data

    # Forward pass
    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask,
                    labels=batch_labels)
    loss = outputs.loss
    logits = outputs.logits

    # Convert predictions and labels to lists
    batch_hyp = torch.argmax(logits, dim=1).tolist()
    batch_label = batch_labels.tolist()

    return loss, batch_hyp, batch_label

def document_sentiment_metrics_fn(list_hyp, list_label):
    # Compute accuracy and macro-averaged F1, recall, and precision
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics
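A quick, hypothetical call showing the expected input format: predicted labels first, true labels second, both as numeric class indices.

# Hypothetical toy example: 3 correct predictions out of 4
example_metrics = document_sentiment_metrics_fn([1, 0, 2, 1], [1, 0, 2, 0])
print(example_metrics)  # ACC is 0.75; F1 / REC / PRE are macro-averaged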

X_train, X_val, y_train, y_val, indices_train, indices_val = train_test_split(
    train["review_text"], train['category'], train.index,
    stratify=train["category"], test_size=0.25, random_state=50)

pd.concat([X_train, y_train], axis=1).reset_index(drop=True).to_csv("train_1.csv", index=False)
pd.concat([X_val, y_val], axis=1).reset_index(drop=True).to_csv("validation_1.csv", index=False)
pd.concat([test[["review_text"]], test[['category']]], axis=1).reset_index(drop=True).to_csv("test.csv", index=False)

# Load tokenizer and config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

texts = X_train.tolist()
sentimen_analysis_labels = []

# Perform sentiment analysis with the pre-trained (not yet fine-tuned) model
for text in texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

    sentimen_analysis_labels.append(label)

def objective(trial):
    # Define the hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)

    # Create the optimizer with the suggested hyperparameters
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Training loop
    model.train()
    train_subset = list(zip(X_train, y_train))
    for epoch in range(5):  # adjust the number of epochs as needed
        for text, label in train_subset:
            subwords = tokenizer.encode(text)
            subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
            labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)

            optimizer.zero_grad()
            outputs = model(subwords, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Calculate and return the evaluation metric:
    # here simply the accuracy on a small validation set
    val_subset = list(zip(X_val, y_val))
    correct_predictions = 0
    total_predictions = 0
    model.eval()
    for text, label in val_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        logits = model(subwords)[0]
        predicted_label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
        if i2w[predicted_label] == label:
            correct_predictions += 1
        total_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

# Create an Optuna study
study = optuna.create_study(direction="maximize")

# Run the optimization
study.optimize(objective, n_trials=3)

# Print the best trial
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)

from optuna.visualization import plot_parallel_coordinate
plot_parallel_coordinate(study).show()

from optuna.visualization import plot_optimization_history
plot_optimization_history(study).show()

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=study.best_params['learning_rate'],
                              weight_decay=study.best_params['weight_decay'])

# Store losses for plotting
train_losses = []
val_losses = []

# Training loop with the best hyperparameters
train_subset = list(zip(X_train, y_train))
val_subset = list(zip(X_val, y_val))
for epoch in range(5):  # adjust the number of epochs as needed
    model.train()  # re-enable training mode after the previous epoch's evaluation pass
    epoch_train_loss = 0.0
    for text, label in train_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)

        optimizer.zero_grad()
        outputs = model(subwords, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()

    # Average training loss for the epoch
    avg_train_loss = epoch_train_loss / len(train_subset)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

    # Compute the validation loss on the small validation set
    model.eval()
    val_loss = 0.0
    for text, label in val_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
        outputs = model(subwords, labels=labels)
        val_loss += outputs.loss.item()

    # Average validation loss for the epoch
    avg_val_loss = val_loss / len(val_subset)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")

# Plot train and validation losses
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Train and Validation Loss")
plt.legend()
plt.grid(True)
plt.show()
# Evaluate on train data
texts = X_train.tolist()
sentimen_analysis_tuned_labels = []

# Perform sentiment analysis with the fine-tuned model
for text in texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    sentimen_analysis_tuned_labels.append(label)

# Evaluate on test data
test = test.dropna()
test_texts = test['review_text'].tolist()
test_sentimen_analysis_labels = []

# Perform sentiment analysis on the test set
for text in test_texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    test_sentimen_analysis_labels.append(label)

pred = pd.DataFrame({'label': test_sentimen_analysis_labels}).reset_index()
pred.to_csv('pred.csv', index=False)

# Create a DataFrame from the collected labels
train_result_df = pd.DataFrame({
    'label': y_train.tolist(),
    'sentimen_analysis': sentimen_analysis_labels,
    'sentimen_analysis_tuned': sentimen_analysis_tuned_labels
})

# Concatenate the labels DataFrame with the original DataFrame (e.g. X_train)
X_train_df = X_train.to_frame()
train_results_df = pd.concat([X_train_df, train_result_df], axis=1)
print(train_results_df.head())
train_results_df.to_csv('train_results.csv', index=False)

# Map the categorical labels to numeric values
label_mapping = {'positive': 1, 'neutral': 2, 'negative': 0}

# Convert the label column to a list using map
train_results_df = train_results_df.dropna()
true_label = train_results_df['label'].map(label_mapping).tolist()

# Calculate metrics for the non-optimized model
metrics_no_optimization = document_sentiment_metrics_fn(train_results_df['sentimen_analysis'], true_label)
print("Metrics (No Optimization):", metrics_no_optimization)

# Calculate metrics for the Bayesian-optimized model
metrics_bayesian = document_sentiment_metrics_fn(train_results_df['sentimen_analysis_tuned'], true_label)
print("Metrics (Bayesian Optimization):", metrics_bayesian)

# Create a DataFrame from the collected test labels
test_results_df = pd.DataFrame({
    'review_text': test['review_text'].tolist(),
    'label': test['category'].tolist(),
    'sentimen_analysis': pred['label']
})

# test_results_df = test_results_df.dropna()
print(test_results_df.head())

# Map the categorical labels to numeric values
label_mapping = {'positive': 1, 'neutral': 2, 'negative': 0}

# Convert the label column to a list using map
true_test_label = test_results_df['label'].map(label_mapping).tolist()

# Calculate metrics for the Bayesian-optimized model on the test data
test_metrics_bayesian = document_sentiment_metrics_fn(test_results_df['sentimen_analysis'], true_test_label)
print("Metrics (Bayesian Optimization) on test data:", test_metrics_bayesian)

Explain the steps I should include in the research methodology chapter, and explain the stages of the "results and discussion" chapter based on this code.
