Script Unit2
Script Unit2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Lift the column display limit so wide frames are shown without truncation.
pd.options.display.max_columns = None
Panggil Dataset
# Location of the UCI "Adult" dataset. Adjacent string literals inside the
# parentheses are joined by Python, so the URL stays one value while each
# source line remains short (a bare line break inside a string literal is a
# syntax error).
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "adult/adult.data"
)

# Column labels are passed explicitly through `names=`. 'income' is the
# label column that later steps of this script read.
# NOTE(review): the remaining names follow the attribute list published with
# the UCI Adult dataset -- confirm they match the course material.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# Load the frame before previewing it; the parser options mirror the
# Google Drive variant of this step: "?" entries are read as missing values
# and spaces following the comma delimiter are skipped.
df = pd.read_csv(
    url,
    sep=',',
    names=column_names,
    skipinitialspace=True,
    na_values="?",
)
df.head(5)
Perlu diperhatikan: jika menggunakan Google Drive sebagai tempat untuk meletakkan dataset,
gunakan dua script berikut ini untuk mendefinisikan lokasi file dan menghubungkan Google Drive
dengan Colab.
import sys
# Append the dataset folder to the module search path.
# NOTE(review): `folder_name` is not assigned anywhere in the visible script --
# presumably it is the Google Drive folder that holds adult.data; it must be
# defined (and Drive mounted) before this line runs. TODO confirm.
sys.path.append(f'{folder_name}')
Memuat dataset
# Read adult.data from the dataset folder into a DataFrame.
#   names            -> column labels supplied by `column_names`
#   skipinitialspace -> skip the spaces that follow each comma delimiter
#   na_values        -> treat "?" entries as missing values
df = pd.read_csv(
    f'{folder_name}/adult.data',
    names=column_names,
    sep=',',
    na_values="?",
    skipinitialspace=True,
)

# Preview the first five rows.
df.head(n=5)
# Structural overview of the frame.
df.info()

# Number of missing entries in every column, printed under a heading.
missing_values = df.isna().sum()
print("\nJumlah missing value per kolom:", missing_values, sep="\n")

# Summary statistics with the default column selection.
df.describe()

# Summary statistics restricted to the object-dtype columns.
df.describe(include=['object'])

# Relative frequency of each value in the 'income' column.
df['income'].value_counts(normalize=True)
Melakukan visualisasi distribusi
# Labels of all numeric columns; "number" is the string selector equivalent
# to np.number in select_dtypes.
numeric_features = df.select_dtypes(include="number").columns

# One histogram per numeric column, drawn on a single 15x10 figure.
df.loc[:, numeric_features].hist(figsize=(15, 10))
plt.tight_layout()
plt.show()
# Labels of all object-dtype columns.
categorical_features = df.select_dtypes(include=['object']).columns

# Draw one bar chart per categorical column showing how often each value
# occurs. Every plotting call below belongs to the loop body, so each column
# gets its own figure, title and axis labels.
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    df[feature].value_counts().plot(kind='bar')
    plt.title(f'Distribution of {feature}')
    plt.ylabel('Count')
    plt.xlabel(feature)
    # Tilt the category labels so longer names remain readable.
    plt.xticks(rotation=45)
    plt.show()
Analisis korelasi
# Pairwise correlation between the numeric columns.
correlation_matrix = df.loc[:, numeric_features].corr()

# Render the matrix as an annotated heatmap on a 12x10 figure.
plt.figure(figsize=(12, 10))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix of Numeric Features')
plt.show()
Identifikasi outlier
# Box plots of every numeric column on one 15x10 figure.
plt.figure(figsize=(15, 10))
df.loc[:, numeric_features].boxplot()
plt.title('Box Plots of Numeric Features')
# Rotate the column names on the x axis by 90 degrees.
plt.xticks(rotation=90)
plt.show()