0% found this document useful (0 votes)
13 views

Script Unit2

Uploaded by

vemizadefitri123
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views

Script Unit2

Uploaded by

vemizadefitri123
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 6

Memanggil library yang dibutuhkan dalam data analysis:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

Panggil Dataset

Untuk pemanggilan langsung dari web :

# Location of the raw "adult.data" CSV on the UCI archive server.
# The literal is split with implicit concatenation only to keep lines short.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    "adult.data"
)

# Column labels applied to the file; passing names= makes pandas treat the
# first row as data rather than as a header.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education_num', 'marital_status', 'occupation', 'relationship',
    'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country', 'income',
]

# skipinitialspace drops the blank that follows each delimiter, and every
# "?" cell is parsed as NaN.
df = pd.read_csv(url, names=column_names, skipinitialspace=True,
                 na_values="?")

df.head(5)

Pemanggilan melalui drive yang terdapat pada google drive

Perlu diperhatikan: jika menggunakan Google Drive sebagai tempat untuk meletakkan dataset, maka
gunakan dua script berikut ini untuk mendefinisikan lokasi file dan menghubungkan Google Drive
dengan Colab.

Mendefinisikan variabel lokasi drive

# Directory that the later snippets read the dataset from and add to sys.path.
folder_name: str = "/content/drive/My Drive/Colab Notebooks"

membuat penghubung google drive dan colab.

import sys

from google.colab import drive

# Mount Google Drive at /content/drive, forcing a remount if already mounted.
drive.mount(
    '/content/drive',
    force_remount=True,
)

# Make the folder importable by appending it to the module search path.
sys.path.append(f'{folder_name}')
memuat dataset

# Column labels assigned, in order, to the fields of adult.data.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

# Read the comma-separated file from folder_name; blanks after each comma
# are skipped and "?" cells become NaN.
df = pd.read_csv(
    f'{folder_name}/adult.data',
    sep=',',
    names=column_names,
    skipinitialspace=True,
    na_values="?",
)

df.head(5)

hasil pemanggilan Dataset

Lakukan Pembacaan dataset

# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

Pembacaan dataset terkait nilai yang hilang

# Number of missing (NaN) cells in each column of df.
missing_values = df.isna().sum()
print("\nJumlah missing value per kolom:", missing_values, sep="\n")

hasilnya akan terlihat seperti berikut ini :

menghitung persentase nilai yang hilang

# Missing cells per column expressed as a percentage of the row count.
missing_percentage = missing_values.div(len(df)).mul(100)
print("\nPersentase missing value per kolom:", missing_percentage, sep="\n")

Kegiatan Menelaah Data


Membaca deskripsi data

# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of df.
df.describe()

Membaca deskripsi data untuk data dalam bentuk Teks ( object)

# Descriptive summary restricted to the object-dtype (text) columns.
df.describe(include=['object'])

# Relative frequency of each distinct value in the 'income' column.
df['income'].value_counts(normalize=True)
Melakukan visualisasi Distribusi

# Labels of every column whose dtype is numeric.
numeric_features = df.select_dtypes(include=[np.number]).columns

# One histogram per numeric column, sized 15x10 overall.
df.loc[:, numeric_features].hist(figsize=(15, 10))
plt.tight_layout()
plt.show()

untuk data bersifat objek ( kategorikal)

# Labels of every object-dtype (text/categorical) column.
categorical_features = df.select_dtypes(include=['object']).columns

# One bar chart per categorical column showing how often each value occurs.
# The loop body must be indented; without it the snippet does not parse.
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    df[feature].value_counts().plot(kind='bar')
    plt.title(f'Distribution of {feature}')
    plt.ylabel('Count')
    plt.xlabel(feature)
    plt.xticks(rotation=45)
    plt.show()

analisis korelasi

# Pairwise correlation between the numeric columns.
correlation_matrix = df.loc[:, numeric_features].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation Matrix of Numeric Features')
plt.show()

Analisis Hubungan Target Variabel

# For each numeric column, draw a box plot of its values split by the
# 'income' class. The loop body must be indented for the snippet to parse.
for feature in numeric_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='income', y=feature, data=df)
    plt.title(f'{feature} vs Income')
    plt.show()

Analisis untuk nilai kategorikal dengan target

# For each categorical column other than the target, plot the share of each
# 'income' class within every category as a stacked bar chart.
for feature in categorical_features:
    if feature == 'income':
        # The target itself is not compared against itself.
        continue

    # Rows: category values; columns: income classes; cells: row counts.
    df_temp = df.groupby([feature, 'income']).size().unstack()
    # Normalise each row so the income classes sum to 1 per category.
    df_temp_perc = df_temp.div(df_temp.sum(axis=1), axis=0)

    # DataFrame.plot opens its own figure unless an Axes is supplied, so the
    # 18x6 axes is created here and passed in; otherwise an empty 18x6
    # figure is shown next to a default-sized chart.
    fig, ax = plt.subplots(figsize=(18, 6))
    df_temp_perc.plot(kind='bar', stacked=True, ax=ax)
    plt.title(f'{feature} vs Income')
    plt.xlabel(feature)
    plt.ylabel('Percentage')
    plt.legend(title='Income', loc='upper right')
    plt.xticks(rotation=45)
    plt.show()

identifikasi Outlier

# Side-by-side box plots of all numeric columns, used to spot outliers.
plt.figure(figsize=(15, 10))
df.loc[:, numeric_features].boxplot()
plt.title('Box Plots of Numeric Features')
plt.xticks(rotation=90)
plt.show()

You might also like