0% found this document useful (0 votes)
13 views · 3 pages

Vanakam Bro

This document analyzes penguin size data from a CSV file. It explores the data distribution, visualizes relationships between variables, and imputes missing values. A variety of data visualization and analysis techniques are applied including ECDFs, box plots, pair plots, and summary statistics.

Uploaded by

JeYesh AJ
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
13 views · 3 pages

Vanakam Bro

This document analyzes penguin size data from a CSV file. It explores the data distribution, visualizes relationships between variables, and imputes missing values. A variety of data visualization and analysis techniques are applied including ECDFs, box plots, pair plots, and summary statistics.

Uploaded by

JeYesh AJ
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1 / 3

# -*- coding: utf-8 -*-

"""ASS 03.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RX0dQAWd5l79JPyqHedCp_mi6Il4aU93
"""

# Commented out IPython magic to ensure Python compatibility.
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # simple data visualization
# %matplotlib inline
import seaborn as sns  # some advanced data visualizations
import warnings

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' and later dropped the old names. Try the new spelling
# first and fall back to the legacy one; if neither is installed the
# default style is left in place rather than raising.
for style_name in ('seaborn-v0_8-white', 'seaborn-white'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

import os

# Print the full path of every file below the Kaggle input directory.
# os.walk ignores listing errors by default, so this loop simply prints
# nothing when the directory does not exist.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

"""# New Section"""

# google.colab is only importable inside a Colab runtime.
from google.colab import drive

# Make the user's Google Drive available under /content/drive.
drive.mount('/content/drive')

# Load the penguin measurements and keep an untouched copy, so later
# imputation can start again from the raw values.
df = pd.read_csv('/content/penguins_size.csv')
original = df.copy()

print('Dataset has', df.shape[0], 'rows and', df.shape[1], 'columns')

# Column dtypes and non-null counts (info() prints directly).
df.info()

# The bare expressions below are notebook cell outputs: they display a
# result in a notebook but produce no output when run as a script.
df.describe()

df.isnull().sum()

df.head(10)

plt.rcParams['figure.figsize'] = (10, 7)

# One bar chart of relative frequencies per categorical column.
# Each entry is (column name, chart title, x-axis label).
for column, title, xlabel in (
    ('species', 'Penguin Species', 'Species'),
    ('island', 'Islands where Penguins live', 'Island'),
    ('sex', 'Penguins - Sex', 'Sex'),
):
    # normalize=True turns the counts into fractions of the total.
    df[column].value_counts(normalize=True).plot(
        kind='bar', color='seagreen', linewidth=1, edgecolor='k'
    )
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('% (100s)')
    plt.xticks(rotation=360)
    plt.show()

def ecdf(x):
    """Plot the empirical CDF of ``x`` above its estimated density.

    The top panel shows the ECDF as points with a vertical line at the
    mean; the bottom panel shows seaborn's histogram/KDE of the same
    data. Both panels are drawn in one figure and shown once.

    Args:
        x: One-dimensional numeric data (e.g. a DataFrame column).
            Missing values (NaN) are ignored.
    """
    # Drop NaNs before ranking; otherwise they inflate ``n`` and the
    # plotted curve never reaches 1.
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]

    n = len(values)
    a = np.sort(values)
    b = np.arange(1, 1 + n) / n

    plt.subplot(211)
    plt.plot(a, b, marker='.', linestyle='None', c='seagreen')
    plt.axvline(np.mean(values), c='k', label='Mean')
    plt.title('ECDF')
    plt.legend()

    plt.subplot(212)
    # The original ``x`` is passed so a named Series can still label the
    # axis. NOTE(review): distplot is deprecated in recent seaborn
    # releases; consider histplot/displot once the minimum version allows.
    sns.distplot(x, color='r')
    plt.title('Probability Density Function')

    # A single show() keeps both panels in the same figure.
    plt.tight_layout()
    plt.show()

# ECDF and density plots for each numeric measurement.
for measurement in (
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
):
    ecdf(df[measurement])

def box(f):
    """Show a box plot of column ``f`` per species, split by sex.

    Reads the module-level ``df`` DataFrame.

    Args:
        f: Name of the column in ``df`` to plot on the y-axis; it is
            also used as the chart title.
    """
    sns.boxplot(y=f, x='species', hue='sex', data=df)
    plt.title(f)
    plt.show()

# Box plots per species and sex for each numeric measurement.
for measurement in (
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
):
    box(measurement)

# Pairwise relationships between the columns, coloured by species.
sns.pairplot(df, hue='species')
plt.show()

# Impute missing values on a fresh copy of the raw data.
new_df = original.copy()

# Numeric gaps are filled with the column mean. The result is assigned
# back explicitly: calling fillna(inplace=True) on a selected column is
# chained assignment, which pandas deprecates and which does not update
# the parent frame under Copy-on-Write.
for column in (
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
):
    new_df[column] = new_df[column].fillna(original[column].mean())

# The categorical 'sex' column is filled with its most frequent value.
new_df['sex'] = new_df['sex'].fillna(original['sex'].mode()[0])

new_df.head()

new_df.isnull().sum()

You might also like