Unit3 - Cleaning - Preparing - Data - Jupyter Notebook
Unit3 - Cleaning - Preparing - Data - Jupyter Notebook
import pandas as pd

# Workbook that supplies the records used by the cleaning examples below.
source_path = "C:\\Users\\Admin\\Desktop\\sree.xlsx"

# Read the sheet into `d`, wrap it in a DataFrame bound to `df`,
# and end the cell with `df` so the notebook renders the table.
d = pd.read_excel(io=source_path)
df = pd.DataFrame(data=d)
df
Out[24]:
In [ ]:
#### Pandas treats None and NaN as essentially interchangeable for indicating missing or null values.
# To facilitate this convention, there are several useful functions for detecting, removing, and replacing null values in pandas data structures.
In [3]:
# Boolean frame with the same shape as df: True where a cell is missing.
missing_mask = df.isnull()
missing_mask
Out[3]:
In [4]:
# Preview of df with every row that contains a missing value removed.
# The result is only displayed; df itself is not reassigned here.
rows_without_missing = df.dropna()
rows_without_missing
Out[4]:
In [5]:
# Display df again: the dropna() call in the previous cell was neither
# assigned nor in-place, so df still holds the rows with missing values.
df
Out[5]:
In [7]:
# Drop every row that contains at least one missing value and rebind `df`
# to the cleaned frame. Explicit reassignment replaces `inplace=True`:
# the resulting `df` is the same, but the data flow is visible in the code.
df = df.dropna()
In [8]:
# Display df after the rows with missing values were dropped in the previous cell.
df
Out[8]:
In [11]:
import pandas as pd

# Read the workbook again and rebind `d` / `df` to the freshly loaded data
# (presumably to get back the rows removed by the earlier dropna step).
source_path = "C:\\Users\\Admin\\Desktop\\sree.xlsx"
d = pd.read_excel(io=source_path)
df = pd.DataFrame(data=d)
df
Out[11]:
In [12]:
# Preview of df with every missing value replaced by 0.
# The result is only displayed; df itself is not reassigned here.
zero_filled = df.fillna(0)
zero_filled
Out[12]:
In [16]:
In [18]:
In [21]:
import pandas as pd

# Load the workbook again and bind it to `d` / `df`.
source_path = "C:\\Users\\Admin\\Desktop\\sree.xlsx"
d = pd.read_excel(io=source_path)
df = pd.DataFrame(data=d)

# One boolean per row: True when the row repeats an earlier row exactly.
duplicate_flags = df.duplicated()
print(duplicate_flags)
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 True
dtype: bool
In [22]:
In [23]:
In [ ]:
###Data Formatting###
In [9]:
import xlsxwriter
import csv
# Build "dsp.xlsx": a three-row header table (campus / branch / section)
# followed by the rows of "stup.csv". Label cells use the orange format,
# value cells use the purple format.
Campus_Name = "Rkvalley"
Branch_Name = "CSE"
Section_Name = "C"

# Using the workbook as a context manager closes (and therefore saves) it
# even when reading the CSV raises, instead of relying on a trailing close().
with xlsxwriter.Workbook("dsp.xlsx") as book:
    format1 = book.add_format({'bg_color': "orange", 'border': 1})
    format2 = book.add_format({'bg_color': "purple", 'border': 1})
    s = book.add_worksheet("dsp")

    # Header table: label in column 0, value in column 1 (zero-based rows 1-3).
    s.write(1, 0, "Campus name", format1)
    s.write(1, 1, Campus_Name, format2)
    s.write(2, 0, "Branch name", format1)
    s.write(2, 1, Branch_Name, format2)
    s.write(3, 0, "Section name", format1)
    s.write(3, 1, Section_Name, format2)

    # Worksheet row that receives the first CSV row.
    index = 5
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields containing newlines are parsed correctly.
    with open("stup.csv", newline="") as csvfile:
        for row in csv.reader(csvfile):
            # The first CSV row gets the label format, every later row the
            # value format. Named `cell_format` so the builtin `format` is
            # not overwritten for the rest of the notebook session.
            cell_format = format1 if index == 5 else format2
            # Write however many fields the row has: short or blank rows no
            # longer raise IndexError, and extra columns are not dropped.
            for column, value in enumerate(row):
                s.write(index, column, value, cell_format)
            index += 1
In [51]:
### BINNING ###
import pandas as pd

# Workbook with the marks table used for the binning example.
marks_path = "C:\\Users\\Admin\\Desktop\\udaya.xlsx"

# Read the sheet into `d`, wrap it in a DataFrame bound to `df`, and display it.
d = pd.read_excel(io=marks_path)
df = pd.DataFrame(data=d)
df
Out[51]:
0 Chiru 23 30 34 24 40 128
1 Venky 24 23 23 5 35 86
2 Balayya 23 34 20 35 37 126
3 Nag 35 23 32 29 35 119
4 Lakshman 21 29 10 26 29 94
5 Suresh 20 31 31 23 28 113
6 vijay 27 2 23 37 25 87
7 prabhas 30 37 29 34 9 109
8 bunny 28 37 26 29 37 129
9 anushka 25 27 24 22 33 106
10 pspk 37 35 23 35 21 114
11 mahesh 34 29 17 22 9 77
12 ntr 32 23 40 22 23 108
13 ramcharan 31 2 26 40 42 110
In [52]:
# Label each row by the band its TOTAL falls into and store it in 'status'.
# With pd.cut's default right-closed intervals the bands are:
#   (70, 90]   -> 'fail'
#   (90, 110]  -> 'average'
#   (110, 150] -> 'good'
# NOTE(review): a TOTAL of 70 or less, or above 150, lies outside every
# interval and gets NaN instead of a label -- confirm that is intended.
bins = [70, 90, 110, 150]
group_names = ['fail', 'average', 'good']
status_by_total = pd.cut(x=df["TOTAL"], bins=bins, labels=group_names)
df['status'] = status_by_total
df
Out[52]:
1 Venky 24 23 23 5 35 86 fail
4 Lakshman 21 29 10 26 29 94 average
6 vijay 27 2 23 37 25 87 fail
11 mahesh 34 29 17 22 9 77 fail
In [ ]: