# Imports assumed by the rest of this notebook (preprocessing, plotting, TextBlob labelling,
# and the IndoBERT fine-tuning / Optuna sections below).
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
import optuna

# 'data' (the raw reviews DataFrame with a 'text' column) is assumed to be loaded earlier
text = pd.DataFrame({'review_text': data['text']})
text.head()
train, test = train_test_split(text,test_size=0.20, random_state=50)
# Check for null values (missing values) in the dataset
print("Number of missing values in the train set:")
print(train.isnull().sum())
print("\nNumber of missing values in the test set:")
print(test.isnull().sum())
def create_corpus(data, var):
    corpus = []
    for x in data[var].str.split():
        for i in x:
            corpus.append(i)
    return corpus
def remove_indonesian_stop(data, feature):
    # Drop the Indonesian stopwords in fix_stopwords from every document in the column
    stopwords = fix_stopwords
    filtered = []
    for text in data[feature]:
        text_list = [word for word in text.split() if word not in stopwords]
        filtered.append(' '.join(text_list))
    return filtered
def num_count(data, feature):
    # Count the digit characters in each document
    #kompas_list = list(set(kompas['words']))
    num_list = []
    for sentence in tqdm(data[feature]):
        num = 0
        for char in sentence:
            if char.isdigit():
                num += 1
        num_list.append(num)
    return num_list
def plot_unique_word_count(corpus, width, height, range1, range2, title, color, ax=None):
    words = []
    values = []
    len_words = []
    for word, value in zip(pd.DataFrame(corpus).value_counts().index,
                           pd.DataFrame(corpus).value_counts()):
        words.append(word[0])
        values.append(value)
        len_words.append(len(word[0]))
    res = pd.DataFrame({'words': words,
                        'values': values,
                        'len_words': len_words})
    res = res.sort_values(by='values', ascending=False)
    #plt.figure(figsize = [width, height])
    ax.set_title(title)
    sns.barplot(data=res[range1:range2],
                y='words',
                x='values',
                color=color,
                ax=ax)
    for index, value in enumerate(res['values'].iloc[range1:range2]):
        plt.text(value, index, value)
    #plt.show()
    return ax
stopwords = ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti',
'jika', 'jika', 'sehingga', 'kembali', 'dan', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'setelah', 'kami',
'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa',
'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara',
'agar', 'lain', 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka',
'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun',
'sampai', 'sedangkan', 'selagi', 'sementara', 'apakah', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya',
'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'ingin', 'juga', 'nggak',
'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', 'setiap', 'setidaknya', 'sesuatu', 'pasti',
'saja', 'toh', 'ya', 'walau', 'tolong', 'tentu', 'amat', 'apalagi', 'bagaimanapun', 'sekali', 'jadi', 'nya']
keep_stopwords = ['tidak', 'sementara', 'belum', 'tetapi', 'kecuali', 'tapi', 'ada', 'tanpa', 'nggak', 'ok',
'hanya', 'kurang']
# Keep the words in keep_stopwords (negations etc.) by excluding them from the stopword list
fix_stopwords = [word for word in stopwords if word not in keep_stopwords]
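# Illustrative only: a minimal sketch of how create_corpus and remove_indonesian_stop behave
# on two made-up sentences (the sample DataFrame below is not part of the dataset).
_sample = pd.DataFrame({'review_text_cleaned': ['kamar yang bersih dan nyaman',
                                                'pelayanan tidak ramah']})
# create_corpus flattens the column into tokens:
# ['kamar', 'yang', 'bersih', 'dan', 'nyaman', 'pelayanan', 'tidak', 'ramah']
print(create_corpus(_sample, 'review_text_cleaned'))
# remove_indonesian_stop drops words in fix_stopwords ('yang', 'dan', ...) but keeps
# negations such as 'tidak': ['kamar bersih nyaman', 'pelayanan tidak ramah']
print(remove_indonesian_stop(_sample, 'review_text_cleaned'))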
words_dict = {
'tdk' : 'tidak',
'yg' : 'yang',
'ga' : 'tidak',
'gak' : 'tidak',
'tp' : 'tapi',
'd' : 'di',
'sy' : 'saya',
'&' : 'dan',
'dgn' : 'dengan',
'utk' : 'untuk',
'gk' : 'tidak',
'jd' : 'jadi',
'jg' : 'juga',
'dr' : 'dari',
'krn' : 'karena',
'aja' : 'saja',
'karna' : 'karena',
'udah' : 'sudah',
'kmr' : 'kamar',
'g' : 'tidak',
'dpt' : 'dapat',
'banget' : 'sekali',
'bgt' : 'sekali',
'kalo' : 'kalau',
'n' : 'dan',
'bs' : 'bisa',
'oke' : 'ok',
'dg' : 'dengan',
'pake' : 'pakai',
'sampe' : 'sampai',
'dapet' : 'dapat',
'ad' : 'ada',
'lg' : 'lagi',
'bikin' : 'buat',
'tak' : 'tidak',
'ny' : 'nya',
'ngga' : 'tidak',
'nunggu' : 'tunggu',
'klo' : 'kalau',
'blm' : 'belum',
'trus' : 'terus',
'kayak' : 'seperti',
'dlm' : 'dalam',
'udh' : 'sudah',
'tau' : 'tahu',
'org' : 'orang',
'hrs' : 'harus',
'msh' : 'masih',
'sm' : 'sama',
'byk' : 'banyak',
'krg' : 'kurang',
'kmar' : 'kamar',
'spt' : 'seperti',
'pdhl' : 'padahal',
'chek' : 'cek',
'pesen' : 'pesan',
'kran' : 'keran',
'gitu' : 'begitu',
'tpi' : 'tapi',
'lbh' : 'lebih',
'tmpt' : 'tempat',
'dikasi' : 'dikasih',
'serem' : 'seram',
'sya' : 'saya',
'jgn' : 'jangan',
'dri' : 'dari',
'dtg' : 'datang',
'gada' : 'tidak ada',
'standart' : 'standar',
'mlm' : 'malam',
'k' : 'ke',
'kl' : 'kalau',
'sgt': 'sangat',
'y' : 'ya',
'krna' : 'karena',
'tgl' : 'tanggal',
'terimakasih' : 'terima kasih',
'kecoak' : 'kecoa',
'pd' : 'pada',
'tdr' : 'tidur',
'jdi' : 'jadi',
'kyk' : 'seperti',
'sdh' : 'sudah',
'ama' : 'sama',
'gmana' : 'bagaimana',
'dalem' : 'dalam',
'tanyak' : 'tanya',
'taru' : 'taruh',
'gede' : 'besar',
'kaya' : 'seperti',
'access' : 'akses',
'tetep' : 'tetap',
'mgkin' : 'mungkin',
'sower' : 'shower',
'idup' : 'hidup',
'nyaaa' : 'nya',
'baikk' : 'baik',
'hanay' : 'hanya',
'tlp' : 'telpon',
'kluarga' : 'keluarga',
'jln' : 'jalan',
'hr' : 'hari',
'ngak' : 'tidak',
'bli' : 'beli',
'naro' : 'taruh'
}
train['review_text_cleaned'] = [i.lower() for i in train['review_text']]
test['review_text_cleaned'] = [i.lower() for i in test['review_text']]
# removing non-ASCII characters
train['review_text_cleaned'] = [re.sub(r'[^\x00-\x7f]',r'', i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r'[^\x00-\x7f]',r'', i) for i in test['review_text_cleaned']]
# removing newlines
train['review_text_cleaned'] = [re.sub(r'\n', r' ', i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r'\n', r' ', i) for i in test['review_text_cleaned']]
# removing numbers
train['review_text_cleaned'] = [re.sub(r"\d+", r"", i) for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [re.sub(r"\d+", r"", i) for i in test['review_text_cleaned']]
# replacing punctuation with spaces
train['review_text_cleaned'] = [i.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
for i in train['review_text_cleaned']]
test['review_text_cleaned'] = [i.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
for i in test['review_text_cleaned']]
train['num_of_space'] = [i.count(' ') for i in train['review_text_cleaned']]
test['num_of_space'] = [i.count(' ') for i in test['review_text_cleaned']]
train['num_of_words'] = [len(i.split()) for i in train['review_text_cleaned']]
test['num_of_words'] = [len(i.split()) for i in test['review_text_cleaned']]
train['num_unique_char'] = [len(set(i)) for i in train['review_text_cleaned']]
test['num_unique_char'] = [len(set(i)) for i in test['review_text_cleaned']]
train['len_string_initial'] = [len(i) for i in train['review_text']]
test['len_string_initial'] = [len(i) for i in test['review_text']]
train['num_of_numeric'] = num_count(train, 'review_text')
test['num_of_numeric'] = num_count(test, 'review_text')
# Replace slang/abbreviations with their standard forms using words_dict
list_sentence_train = []
for sentence in tqdm(train['review_text_cleaned']):
    cleaned_sentence = [words_dict.get(word, word) for word in sentence.split()]
    list_sentence_train.append(' '.join(cleaned_sentence))
train['review_text_cleaned'] = list_sentence_train

list_sentence_test = []
for sentence in tqdm(test['review_text_cleaned']):
    cleaned_sentence = [words_dict.get(word, word) for word in sentence.split()]
    list_sentence_test.append(' '.join(cleaned_sentence))
test['review_text_cleaned'] = list_sentence_test
# removing Indonesian stopwords
train['review_text_cleaned_nostopwords'] = remove_indonesian_stop(train, 'review_text_cleaned')
test['review_text_cleaned_nostopwords'] = remove_indonesian_stop(test, 'review_text_cleaned')
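# Illustrative only: an end-to-end cleaning sketch for one made-up review, mirroring the steps
# applied above (lowercase, strip non-ASCII/newlines/digits/punctuation, normalise slang via
# words_dict, then drop the stopwords in fix_stopwords).
_raw = "Kamarnya bagus bgt, tp AC-nya tdk dingin!!"
_clean = _raw.lower()
_clean = re.sub(r'[^\x00-\x7f]', r'', _clean)
_clean = re.sub(r'\n', r' ', _clean)
_clean = re.sub(r'\d+', r'', _clean)
_clean = _clean.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
_clean = ' '.join(words_dict.get(w, w) for w in _clean.split())
_clean = ' '.join(w for w in _clean.split() if w not in fix_stopwords)
print(_clean)  # -> "kamarnya bagus tapi ac tidak dingin"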
# Convert the DataFrame into a list of dictionaries
tweet_list = train.to_dict('records')
hasil_analisis = []
for tweet in tweet_list:
    tweet_properties = {}
    tweet_bersih = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                                   " ", tweet["review_text"]).split())
    analysis = TextBlob(tweet_bersih)
    if analysis.sentiment.polarity > 0.0:
        tweet_properties["category"] = 'positive'
    elif analysis.sentiment.polarity < 0.0:
        tweet_properties["category"] = 'negative'
    else:
        tweet_properties["category"] = 'neutral'
    hasil_analisis.append(tweet_properties)
hasil_analisis_df = pd.DataFrame(hasil_analisis)
# Attach the 'category' column to train positionally (train keeps its original index after
# train_test_split, so a plain index-based concat would misalign the labels)
train['category'] = hasil_analisis_df['category'].values
# Convert the DataFrame into a list of dictionaries
tweet_list = test.to_dict('records')
hasil_analisis = []
for tweet in tweet_list:
    tweet_properties = {}
    tweet_bersih = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                                   " ", tweet["review_text"]).split())
    analysis = TextBlob(tweet_bersih)
    if analysis.sentiment.polarity > 0.0:
        tweet_properties["category"] = 'positive'
    elif analysis.sentiment.polarity < 0.0:
        tweet_properties["category"] = 'negative'
    else:
        tweet_properties["category"] = 'neutral'
    hasil_analisis.append(tweet_properties)
hasil_analisis_df = pd.DataFrame(hasil_analisis)
# Attach the 'category' column to test positionally (same reasoning as for train above)
test['category'] = hasil_analisis_df['category'].values
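# Quick sanity check of the TextBlob-derived label distribution (illustrative; TextBlob's
# sentiment analyzer is English-oriented, so many Indonesian reviews may end up 'neutral').
print(train['category'].value_counts(normalize=True))
print(test['category'].value_counts(normalize=True))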
# Sample size and data
sample_size = 100
sample_train = train.sample(n=sample_size, random_state=42)
# Plot configuration
plt.figure(figsize=(10, 5), dpi=90)  # use a tuple for figsize
sns.set_style('whitegrid')
# Build the countplot; fixing the category order keeps the tick labels below correct
ax = sns.countplot(
    x='category',
    data=sample_train,
    order=['negative', 'neutral', 'positive'],
    palette=['#F42E56', '#808080', '#31B057']
)
# Relabel the x axis
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
# Compute the percentage for each bar and add it as a label
total = len(sample_train)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2  # x position at the centre of the bar
    y = p.get_height()                 # y position at the top of the bar
    ax.annotate(percentage, (x, y), ha='center', va='bottom')
# Add the title
plt.title('Class Distribution Plot (Sample)', fontsize=14)
# Show the plot
plt.show()
# Distribution comparison per sentiment class for three text-length features
dist_features = [('len_string_initial', 'Distplot Comparison Based on Length of Characters'),
                 ('num_of_words', 'Distplot Comparison Based on Number of Words'),
                 ('num_unique_char', 'Distplot Comparison Based on Unique Characters')]
for feature, title in dist_features:
    plt.figure(figsize=[15, 5], dpi=90)
    for category, color in [('negative', '#F42E56'), ('neutral', '#808080'), ('positive', '#31B057')]:
        values = train[train['category'] == category][feature]
        sns.distplot(values, color=color,
                     label=f'{category}; mean = {values.mean():.0f}')
    plt.title(title)
    plt.legend()
# Remove rows containing NaN values
train = train.dropna()
fig = plt.figure(figsize = [15, 15])
ax1 = fig.add_subplot(1,2,1)
#class_distribution("colname1", ax=ax1)
plot_unique_word_count(create_corpus(train[train['category'] == 'negative'],"review_text_cleaned"),
15,
18,
0,
50,
'Negative Sentiment Unigram (with all stopwords) \n',
'#F42E56', ax = ax1)
ax2 = fig.add_subplot(1,2,2)
plot_unique_word_count(create_corpus(train[train['category'] == 'negative'],
'review_text_cleaned_nostopwords'),
5,
5,
0,
50,
'Negative Sentiment Unigram (keeping some stopwords) \n',
'#F42E56', ax = ax2)
fig = plt.figure(figsize = [15, 15])
ax1 = fig.add_subplot(1,2,1)
#class_distribution("colname1", ax=ax1)
plot_unique_word_count(create_corpus(train[train['category'] == 'positive'], 'review_text_cleaned'),
15,
18,
0,
50,
'Positive Sentiment Unigram (with all stopwords) \n',
'#31B057', ax = ax1)
ax2 = fig.add_subplot(1,2,2)
plot_unique_word_count(create_corpus(train[train['category'] == 'positive'],
'review_text_cleaned_nostopwords'),
5,
5,
0,
50,
'Positive Sentiment Unigram (keeping some stopwords) \n',
'#31B057', ax = ax2)
from sklearn.feature_extraction.text import CountVectorizer
def get_top_tweet_bigrams(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
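# Illustrative only: what get_top_tweet_bigrams returns on a tiny made-up corpus, i.e.
# (bigram, count) pairs sorted by frequency, e.g. [('kamar bersih', 2), ('bersih nyaman', 1), ...]
_toy = ['kamar bersih nyaman', 'kamar bersih sekali', 'pelayanan ramah sekali']
print(get_top_tweet_bigrams(_toy, n=3))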
fig = plt.figure(figsize=(18,18))
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'negative']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y,y=x, color = '#F42E56')
plt.title('Negative Sentiment Bigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'positive']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y,y=x, color = '#31B057')
plt.title('Positive Sentiment Bigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
from sklearn.feature_extraction.text import CountVectorizer
# The same helper redefined for trigrams (ngram_range=(3, 3)); the bigram name is kept from the cell above
def get_top_tweet_bigrams(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
fig = plt.figure(figsize=(18,18))
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'negative']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y,y=x, color = '#F42E56')
plt.title('Negative Sentiment Trigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'positive']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y,y=x, color = '#31B057')
plt.title('Positive Sentiment Trigrams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
from sklearn.feature_extraction.text import CountVectorizer
# The same helper redefined for 4-grams (ngram_range=(4, 4)); the bigram name is kept from the cell above
def get_top_tweet_bigrams(corpus, n=None):
    vec = CountVectorizer(ngram_range=(4, 4)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
fig = plt.figure(figsize=(18,18))
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'negative']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 1)
sns.barplot(x=y,y=x, color = '#F42E56')
plt.title('Negative Sentiment 4-grams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
top_tweet_bigrams=get_top_tweet_bigrams(train[train['category'] == 'positive']
['review_text_cleaned_nostopwords'])[:50]
x,y=map(list,zip(*top_tweet_bigrams))
fig.add_subplot(1, 2, 2)
sns.barplot(x=y,y=x, color = '#31B057')
plt.title('Positive Sentiment 4-grams \n')
index = 0
for i in top_tweet_bigrams:
    plt.text(i[1]+.5, index+.2, i[1])
    index += 1
#####
# Document Sentiment
#####
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "label": torch.tensor(label, dtype=torch.long),
        }
class DocumentSentimentDataset(Dataset):
    # Static constant variables
    LABEL2INDEX = {'positive': 1, 'negative': 0, 'neutral': 2}  # Added 'neutral': 2
    INDEX2LABEL = {1: 'positive', 0: 'negative', 2: 'neutral'}  # Added 2: 'neutral'
    NUM_LABELS = 3  # Updated to 3

    def load_dataset(self, path):
        df = pd.read_csv(path)
        return df

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def __getitem__(self, index):
        data = self.data.loc[index, :]
        text, sentiment = data['review_text'], data['category']
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        # Keep the sentiment as a plain string; the collate function maps it via LABEL2INDEX
        sentiment = str(sentiment)
        return np.array(subwords), sentiment, data['review_text']

    def __len__(self):
        return len(self.data)
class DocumentSentimentDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len

    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)

        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)

        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i, :len(subwords)] = subwords
            mask_batch[i, :len(subwords)] = 1
            # Convert the sentiment label to its numerical index using LABEL2INDEX (-1 for unknown labels)
            sentiment_batch[i, 0] = DocumentSentimentDataset.LABEL2INDEX.get(sentiment, -1)
            seq_list.append(raw_seq)
        return subword_batch, mask_batch, sentiment_batch, seq_list
# Forward function for sequence classification
def forward_sequence_classification(model, batch_data):
    # Unpack batch data
    batch_input_ids, batch_attention_mask, batch_labels = batch_data
    # Forward pass
    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask,
                    labels=batch_labels)
    loss = outputs.loss
    logits = outputs.logits
    # Convert predictions and labels to lists
    batch_hyp = torch.argmax(logits, dim=1).tolist()
    batch_label = batch_labels.tolist()
    return loss, batch_hyp, batch_label
def document_sentiment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics
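# Illustrative only: document_sentiment_metrics_fn takes (predictions, true labels) as
# integer-encoded lists and returns accuracy plus macro-averaged F1/recall/precision.
print(document_sentiment_metrics_fn([1, 0, 0, 1], [1, 0, 1, 1]))
# -> {'ACC': 0.75, 'F1': ..., 'REC': ..., 'PRE': ...}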
X_train, X_val, y_train, y_val, indices_train, indices_val = train_test_split(train["review_text"],
train['category'], train.index, stratify = train["category"],
test_size=0.25, random_state=50)
pd.concat([X_train,y_train],axis=1).reset_index(drop=True).to_csv("train_1.csv",index=False)
pd.concat([X_val,y_val],axis=1).reset_index(drop=True).to_csv("validation_1.csv",index=False)
pd.concat([test[["review_text"]],test[['category']]],
axis=1).reset_index(drop=True).to_csv("test.csv",index=False)
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS
# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1',
config=config)
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)
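# Illustrative only (not used by the manual loops below): how the DocumentSentimentDataset /
# DocumentSentimentDataLoader pair defined above could consume train_1.csv with this tokenizer.
_train_ds = DocumentSentimentDataset('train_1.csv', tokenizer)
_train_dl = DocumentSentimentDataLoader(dataset=_train_ds, max_seq_len=512,
                                        batch_size=32, shuffle=True)
subword_batch, mask_batch, sentiment_batch, seq_list = next(iter(_train_dl))
print(subword_batch.shape, mask_batch.shape, sentiment_batch.shape)  # numpy batches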
texts = X_train.tolist()
sentimen_analysis_labels = []
# Perform sentiment analysis
for text in texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    sentimen_analysis_labels.append(label)
def objective(trial):
    # Define the hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-10, 1e-3, log=True)
    # Create the optimizer with the suggested hyperparameters
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Training loop
    model.train()
    train_subset = list(zip(X_train, y_train))
    for epoch in range(5):  # Adjust number of epochs
        for text, label in train_subset:
            subwords = tokenizer.encode(text)
            subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
            labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
            optimizer.zero_grad()
            outputs = model(subwords, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Evaluation metric: accuracy on the validation split
    val_subset = list(zip(X_val, y_val))
    correct_predictions = 0
    total_predictions = 0
    model.eval()
    for text, label in val_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        logits = model(subwords)[0]
        predicted_label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
        if i2w[predicted_label] == label:
            correct_predictions += 1
        total_predictions += 1
    accuracy = correct_predictions / total_predictions
    return accuracy
# Create an Optuna study
study = optuna.create_study(direction="maximize")
# Run the optimization
study.optimize(objective, n_trials=3)
# Print the best trial
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best value:", study.best_value)
from optuna.visualization import plot_parallel_coordinate
plot_parallel_coordinate(study).show()
from optuna.visualization import plot_optimization_history
plot_optimization_history(study).show()
optimizer = torch.optim.AdamW(model.parameters(), lr=study.best_params['learning_rate'],
weight_decay=study.best_params['weight_decay'])
# Store losses for plotting
train_losses = []
val_losses = []
# Training loop with per-epoch validation
train_subset = list(zip(X_train, y_train))
val_subset = list(zip(X_val, y_val))
for epoch in range(5):  # Adjust number of epochs
    model.train()
    epoch_train_loss = 0.0
    for text, label in train_subset:
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
        labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
        optimizer.zero_grad()
        outputs = model(subwords, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()
    # Average training loss for the epoch
    avg_train_loss = epoch_train_loss / len(train_subset)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

    # Validation loss for the epoch
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for text, label in val_subset:
            subwords = tokenizer.encode(text)
            subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
            labels = torch.tensor([w2i[label]], dtype=torch.long, device=model.device)
            outputs = model(subwords, labels=labels)
            val_loss += outputs.loss.item()
    # Average validation loss for the epoch
    avg_val_loss = val_loss / len(val_subset)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")
# Plot train and validation losses
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Train and Validation Loss")
plt.legend()
plt.grid(True)
plt.show()
# Evaluate on train data
texts = X_train.tolist()
sentimen_analysis_tuned_labels = []
# Perform sentiment analysis
for text in texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    sentimen_analysis_tuned_labels.append(label)
# Evaluate on test data
test = test.dropna()
test_texts = test['review_text'].tolist()
test_sentimen_analysis_labels = []
# Perform sentiment analysis
for text in test_texts:
    subwords = tokenizer.encode(text)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    logits = model(subwords)[0]
    label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')
    test_sentimen_analysis_labels.append(label)
pred = pd.DataFrame({'label':test_sentimen_analysis_labels}).reset_index()
pred.to_csv('pred.csv', index=False)
# Create a DataFrame from the collected labels
train_result_df = pd.DataFrame({
'label': y_train.tolist(),
'sentimen_analysis': sentimen_analysis_labels,
'sentimen_analysis_tuned': sentimen_analysis_tuned_labels
})
# Concatenate the labels DataFrame with the original review texts (reset the index so the
# positional predictions line up with X_train's rows)
X_train_df = X_train.reset_index(drop=True).to_frame()
train_results_df = pd.concat([X_train_df, train_result_df], axis=1)
print(train_results_df.head())
train_results_df.to_csv('train_results.csv', index=False)
# Map category values to numeric labels
label_mapping = {'positive': 1, 'neutral': 2, 'negative': 0}
# Convert the label column to a list of numeric labels with map
train_results_df = train_results_df.dropna()
true_label = train_results_df['label'].map(label_mapping).tolist()
# Calculate metrics for the non-optimized model
metrics_no_optimization = document_sentiment_metrics_fn(train_results_df['sentimen_analysis'],
true_label)
print("Metrics (No Optimization):", metrics_no_optimization)
# Calculate metrics for the Bayesian-optimized model
metrics_bayesian = document_sentiment_metrics_fn(train_results_df['sentimen_analysis_tuned'],
true_label)
print("Metrics (Bayesian Optimization):", metrics_bayesian)
# Create a DataFrame from the collected labels
test_results_df = pd.DataFrame({
'review_text': test['review_text'].tolist(),
'label': test['category'].tolist(),
'sentimen_analysis': pred['label']
})
#test_results_df = test_results_df.dropna()
print(test_results_df.head())
# Map category values to numeric labels
label_mapping = {'positive': 1, 'neutral': 2, 'negative': 0}
# Convert the label column to a list of numeric labels with map
true_test_label = test_results_df['label'].map(label_mapping).tolist()
# Calculate metrics for the Bayesian-optimized model
test_metrics_bayesian = document_sentiment_metrics_fn(test_results_df['sentimen_analysis'],
true_test_label)
print("Metrics (Bayesian Optimization) on test data:", test_metrics_bayesian)
Explain the steps I should include in the research methodology, and describe the stages of the "results and discussion" section based on this code.