Data Analysis Project
Data Analysis Project
In [4]: df.head()
Out[4]:
2022 2020 2015 2010 2000 1990
Rank Country/Territory Continent
Population Population Population Population Population Population
North
2 3 United States 3.382899e+08 3.359420e+08 3.246078e+08 3.111828e+08 2.823986e+08 2.480837e+08 2
America
In [51]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Rank 234 non-null int64
1 Country/Territory 234 non-null object
2 Continent 234 non-null object
3 2022 Population 234 non-null float64
4 2020 Population 234 non-null float64
5 2015 Population 234 non-null float64
6 2010 Population 234 non-null float64
7 2000 Population 234 non-null float64
8 1990 Population 234 non-null float64
9 1980 Population 234 non-null float64
10 1970 Population 234 non-null float64
11 Area (km?) 234 non-null int64
12 Density (per km?) 234 non-null float64
13 Growth Rate 234 non-null float64
14 World Population Percentage 234 non-null float64
dtypes: float64(11), int64(2), object(2)
memory usage: 27.6+ KB
Rank 1.000000 -0.358361 -0.355854 -0.351222 -0.347461 -0.341057 -0.336152 -0.335246 -0.335379 -0.3837
2022
-0.358361 1.000000 0.999946 0.999490 0.998629 0.994605 0.987228 0.980285 0.973162 0.4534
Population
2020
-0.355854 0.999946 1.000000 0.999763 0.999105 0.995583 0.988724 0.982121 0.975254 0.4549
Population
2015
-0.351222 0.999490 0.999763 1.000000 0.999783 0.997340 0.991594 0.985724 0.979414 0.4582
Population
2010
-0.347461 0.998629 0.999105 0.999783 1.000000 0.998593 0.993929 0.988786 0.983042 0.4619
Population
2000
-0.341057 0.994605 0.995583 0.997340 0.998593 1.000000 0.998336 0.995160 0.990956 0.4739
Population
1990
-0.336152 0.987228 0.988724 0.991594 0.993929 0.998336 1.000000 0.999042 0.996602 0.4867
Population
1980
-0.335246 0.980285 0.982121 0.985724 0.988786 0.995160 0.999042 1.000000 0.999194 0.4981
Population
1970
-0.335379 0.973162 0.975254 0.979414 0.983042 0.990956 0.996602 0.999194 1.000000 0.5099
Population
Area (km?) -0.383774 0.453411 0.454993 0.458240 0.461936 0.473933 0.486764 0.498166 0.509940 1.0000
Density
0.129436 -0.027618 -0.027358 -0.026857 -0.026505 -0.026139 -0.026224 -0.026587 -0.026881 -0.0631
(per km?)
2022 2020 2015 2010 2000 1990 1980 1970 Ar
Rank
Population Population Population Population Population Population Population Population (km
Growth
-0.224561 -0.020863 -0.025116 -0.032154 -0.037983 -0.050515 -0.062397 -0.072349 -0.081313 -0.0139
Rate
World
Population -0.358464 0.999999 0.999944 0.999487 0.998626 0.994598 0.987218 0.980273 0.973150 0.4532
Percentage
In [9]: continent_wise
Out[9]: Continent
Africa 57
Asia 50
Europe 50
North America 40
Oceania 23
South America 14
Name: Country/Territory, dtype: int64
In [16]: continents=continent_wise.index
counts=continent_wise.values
sns.set_style('whitegrid')
plt.pie(counts,labels=continents,shadow=True,startangle=90,colors=sns.color_palette('viridis'),autopct=lambda p:f'{in
plt.title('Continents wise Countries',fontsize=20)
plt.grid(True)
plt.show()
In [9]: continent_wise_pop
Out[9]: Continent
Africa 1.426731e+09
Asia 4.721383e+09
Europe 7.431475e+08
North America 6.002961e+08
Oceania 4.503855e+07
South America 4.368166e+08
Name: 2022 Population , dtype: float64
In [10]: continents=continent_wise_pop.index
population=continent_wise_pop.values
sns.set_style('whitegrid')
explode=[0,0,0,0,1,0]
plt.pie(population,labels=continents,explode=explode,autopct='%1.1f%%',shadow=True)
plt.title('continent wise population percentage',fontsize=20)
#plt.lengend()
plt.show()
In [17]: #lets plot world population over the years
Total_Population=df[[' 2022 Population ',' 2020 Population ',' 2015 Population ',' 2010 Population ',' 2000 Populatio
Total_Population
Out[17]: 2022 Population 7.973413e+09
2020 Population 7.839251e+09
2015 Population 7.424810e+09
2010 Population 6.983785e+09
2000 Population 6.147056e+09
1990 Population 5.314192e+09
1980 Population 4.442400e+09
1970 Population 3.694137e+09
dtype: float64
In [18]: plt.figure(figsize=(15,5))
sns.set_style('whitegrid')
x=Total_Population.values
sns.lineplot(Total_Population,marker="o",ms=7,label=x)
plt.grid(ls='dotted')
plt.title('Total Population over years',fontsize=20)
plt.ylabel('In Billions')
plt.legend(title='Population',fancybox=False)
plt.xlabel('Year')
plt.show()
In [19]: years_population_continent_wise=df.groupby(['Continent'])[[' 2022 Population ',' 2020 Population ',' 2015 Population
In [20]: dff=years_population_continent_wise.transpose()
dff
Out[20]: Continent Asia Africa Europe North America South America Oceania
In [21]: plt.figure(figsize=(15,6))
sns.lineplot(dff,marker='*',ms=10)
plt.xlabel('Year')
plt.ylabel('In Billions')
plt.grid(ls='dotted')
plt.title('Continental Population over years',fontsize=20)
plt.show()
In [87]: header_values=years_population_continent_wise.columns
In [88]: header_values
Out[88]: Index([' 2022 Population ', ' 2020 Population ', ' 2015 Population ',
' 2010 Population ', ' 2000 Population ', ' 1990 Population ',
' 1980 Population ', ' 1970 Population '],
dtype='object')
In [22]: #Melt the data as per requirement
df_melted=df.melt(id_vars=['Continent'],value_vars=[' 2022 Population ',' 2020 Population ',' 2015 Population ',' 201
In [17]: df_melted
In [34]: #population_by_continent=df_melted.groupby(['Continent'])['Year'].sum().reset_index()
In [35]: #population_by_continent
1.4B
1.2B
1B
Population
0.8B
0.6B
0.4B
0.2B
In [25]: selected_columns=[' 2022 Population ',' 2020 Population ',' 2015 Population ',' 2010 Population ',' 2000 Population
total_population=df[selected_columns].sum()
In [26]: a=total_population.index.str.replace('Population','')
b=total_population.values
In [27]: plt.figure(figsize=(15,5))
plt.bar(a,b,label=b,color=sns.color_palette('inferno'),width=0.5)
plt.legend(title='Population ')
plt.xlabel('YEAR')
plt.ylabel('POPULATION')
plt.title('YEAR WISE WORLD POPULATION',fontsize=20)
#plt.ylim(0,7)
plt.show()
In [28]: Top_10_by_growthrate=df.sort_values(by='Growth Rate',ascending=False).head(10)
Low_Population_growth_countries=df.sort_values(by='Growth Rate',ascending=False).tail(10)
In [29]: Top_10_by_growthrate
Out[29]:
2022 2020 2015 2010 2000 1990 1980
Rank Country/Territory Continent
Population Population Population Population Population Population Population
134 135 Moldova Europe 3272996.0 3084847.0 3277388.0 3678186.0 4251573.0 4480199.0 4103240.0
115 116 Slovakia Europe 5643453.0 5456681.0 5424444.0 5396424.0 5376690.0 5261305.0 4973883.0
181 182 Mayotte Africa 326101.0 305587.0 249545.0 211786.0 159215.0 92659.0 52233.0
In [30]: x=Top_10_by_growthrate['Country/Territory']
y=Top_10_by_growthrate['Growth Rate']
plt.figure(figsize=(12,5))
plt.bar(x,y,width=0.5,label=y,color=sns.color_palette('plasma'))
plt.legend(loc=1,title='Growth rate')
plt.title('HIGHESH GROWING COUNTRIES IN THE WORLD',fontsize=20)
plt.ylim(1,1.10)
plt.ylabel('GROWTH RATE')
plt.xlabel('COUNTRIES')
plt.show()
129 130 Croatia Europe 4030358 4096868 4254815 4368682 4548434 4873707 4680144
104 105 Serbia Europe 7221365 7358005 7519496 7653748 7935022 7987529 7777010
214 215 Marshall Islands Oceania 41569 43413 49410 53416 54224 46047 31988
Bosnia and
136 137 Europe 3233526 3318407 3524324 3811088 4179350 4494310 4199820
Herzegovina
150 151 Latvia Europe 1850651 1897052 1991955 2101530 2392530 2689391 2572037
140 141 Lithuania Europe 2750055 2820267 2963765 3139019 3599637 3785847 3521206
107 108 Bulgaria Europe 6781953 6979175 7309253 7592273 8097691 8767778 8980606
212 213 American Samoa Oceania 44273 46189 51368 54849 58230 47818 32886
118 119 Lebanon Asia 5489739 5662923 6398940 4995800 4320642 3593700 2963702
In [46]: x=Low_Population_growth_countries['Country/Territory']
y=Low_Population_growth_countries['Growth Rate']
plt.figure(figsize=(16,5))
plt.bar(x,y,label=y,color=sns.color_palette('magma'))
plt.title('SLOWEST GROWING COUNTRIES',fontsize=20)
plt.legend(title='Growth Rate')
plt.xlabel('COUNTRY')
plt.ylabel('GROWTH RATE')
plt.ylim(0,1.2)
plt.xlim(-0.5,10)
plt.show()
In [30]: df.columns
In [34]: Top_10_Populated_countries_in_1970
Out[34]: Country/Territory
China 822534450.0
India 557501301.0
United States 200328340.0
Russia 130093010.0
Indonesia 115228394.0
Japan 105416839.0
Brazil 96369875.0
Germany 78294583.0
Bangladesh 67541860.0
Pakistan 59290872.0
Name: 1970 Population , dtype: float64
In [41]: x=Top_10_Populated_countries_in_1970.index
y=Top_10_Populated_countries_in_1970.values
plt.figure(figsize=(12,5))
plt.bar(x,y,label=y,color=sns.color_palette('magma'))
plt.title('MOST POPULATED COUNTRIES in 1970',fontsize=20)
plt.legend(title='POPULATION')
plt.xlabel('COUNTRY')
plt.ylabel('In Billions')
plt.show()
In [38]: Top_10_Populated_countries_in_2022=df.groupby('Country/Territory')[' 2022 Population '].sum().sort_values(ascending=F
In [47]: x=Top_10_Populated_countries_in_2022.index
y=Top_10_Populated_countries_in_2022.values
plt.figure(figsize=(12,6))
#c=sns.color_palette('cividis')
plt.bar(x,y,label=y,color=sns.color_palette('cividis'))
plt.title('MOST POPULATED COUNTRIES in 2022',fontsize=20)
plt.xlabel('COUNTRY')
plt.ylabel('In Billions')
plt.legend(title='Population')
plt.show()
In [69]: df[df['Country/Territory']=='Vatican City']
Out[69]:
2022 2020 2015 2010 2000 1990 1980
Rank Country/Territory Continent
Population Population Population Population Population Population Population
233 234 Vatican City Europe 510 520 564 596 651 700 733
Out[44]: Country/Territory
Russia 17098242
Canada 9984670
China 9706961
United States 9372610
Brazil 8515767
Australia 7692024
India 3287590
Argentina 2780400
Kazakhstan 2724900
Algeria 2381741
Name: Area (km?), dtype: int64
In [49]: plt.figure(figsize=(10,6))
#sns.despine(Area)
x=Area.index
y=Area.values
plt.bar(x,y,label=y,color=sns.color_palette('Greens'))
plt.title('LARGEST COUNTRIES IN THE WORLD',fontsize=20)
plt.legend(title='AREA')
plt.xlabel('COUNTRY')
plt.ylabel('Square Kilometers')
#plt.legend(labels=[17098242, 9984670,9706961,9372610,8515767,7692024,3287590,2780400,2724900,2381741])
plt.show()
In [48]: df.columns
In [106… densely_Populated
Out[106… Country/Territory
Macau 23172.2667
Monaco 18234.5000
Singapore 8416.4634
Hong Kong 6783.3922
Gibraltar 5441.5000
Bahrain 1924.4876
Maldives 1745.9567
Malta 1687.6139
Sint Maarten 1299.2647
Bermuda 1188.5926
Name: Density (per km?), dtype: float64
In [145… plt.figure(figsize=(10,6))
b=densely_Populated.values
a=densely_Populated.index
plt.bar(a,b,label=b,color=sns.color_palette('husl'))
plt.legend(title='Density (per km?)')
plt.ylabel('DENSITY')
plt.xlabel('COUNTRY')
plt.title('MOST DENSELY POPULATED COUNTRIES')
plt.show()
In [155… b=densely_Populated.values
a=densely_Populated.index
fig=go.Figure(data=[go.Bar(x=a,y=b)])
fig.update_layout(title='bar')
bar
20k
15k
10k
5k
In [147… low_densely_Populated=df.groupby('Country/Territory')['Density (per km?)'].sum().sort_values().head(10)
In [148… low_densely_Populated
Out[148… Country/Territory
Greenland 0.0261
Falkland Islands 0.3105
Western Sahara 2.1654
Mongolia 2.1727
Namibia 3.1092
Australia 3.4032
Iceland 3.6204
French Guiana 3.6459
Guyana 3.7621
Suriname 3.7727
Name: Density (per km?), dtype: float64
In [153… plt.figure(figsize=(14,4))
x=low_densely_Populated.index
y=low_densely_Populated.values
plt.bar(x,y,label=y,color=sns.color_palette('viridis'))
plt.title('LOW DENSELY POPULATED COUNTRIES')
plt.legend(title='Density (per km?)')
plt.xlabel('DENSITY')
plt.ylabel('COUNTRY')
#plt.xlim(-1,10)
plt.show()
In [21]: features=[' 1970 Population ' ,' 2020 Population ']
for feature in features:
fig = px.choropleth(df,
locations='Country/Territory',
locationmode='country names',
color=feature,
hover_name='Country/Territory',
template='plotly_white',
title = feature)
fig.show()
1970 Population
2020 Population
PROJECT NO - 02 : SUPERMART GROCERY SALES - RETAIL ANALYSIS
In [7]: df.head()
Health 2017-
1 OD2 Sudha Beverages Krishnagiri
Drinks 11-08
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Order ID 9994 non-null object
1 Customer Name 9994 non-null object
2 Category 9994 non-null object
3 Sub Category 9994 non-null object
4 City 9994 non-null object
5 Order Date 9994 non-null object
6 Region 9994 non-null object
7 Sales 9994 non-null int64
8 Discount 9994 non-null float64
9 Profit 9994 non-null float64
10 State 9994 non-null object
dtypes: float64(2), int64(1), object(8)
memory usage: 859.0+ KB
In [5]: df.isnull().sum()
Out[5]: Order ID 0
Customer Name 0
Category 0
Sub Category 0
City 0
Order Date 0
Region 0
Sales 0
Discount 0
Profit 0
State 0
dtype: int64
order
0.000179 0.003022 0.010742 1.000000 -0.033562 -0
day
order
-0.009518 0.002068 0.003184 -0.033562 1.000000 -0
month
order
0.007542 -0.018778 -0.006401 -0.017458 -0.020183 1
year
In [7]: sales_by_category
Out[7]: Category
Eggs, Meat & Fish 2267401
Snacks 2237546
Food Grains 2115272
Bakery 2112281
Fruits & Veggies 2100727
Beverages 2085313
Oil & Masala 2038442
Name: Sales, dtype: int64
In [103… x=sales_by_category.index
y=sales_by_category.values
plt.figure(figsize=(15,6))
plt.bar(x,y,label=y,color=sns.color_palette('plasma'),width=
plt.legend(title='Sales',loc=0)
plt.title('CATEGORY WISE SALES',fontsize=20)
plt.grid(axis='y')
plt.xlabel('CATEGORY',fontsize=14)
plt.ylim(0,3390000)
plt.ylabel("SALES",fontsize=14)
plt.show()
In [7]: city_wise_sales=df.groupby('City')['Sales'].sum().sort_value
In [8]: city_wise_sales
Out[8]: City
Trichy 541403
Nagercoil 551435
Dharmapuri 571553
Dindigul 575631
Theni 579553
Viluppuram 581274
Namakkal 598530
Ooty 599292
Virudhunagar 606820
Madurai 617836
Name: Sales, dtype: int64
In [78]: x=city.index
y=city.values
c=sns.color_palette('inferno')
plt.figure(figsize=(15,6))
plt.barh(x,y,label=y,color=sns.color_palette('inferno'))
plt.title('Top cities with high Sales',fontsize=20)
plt.xlabel('CITY',fontsize=14)
plt.ylabel('SALES',fontsize=14)
plt.grid(axis='x')
plt.xlim(0,900000)
plt.legend(title='Sales',loc=0)
#plt.xticks(rotation=45)
plt.show()
In [5]: df['Order Date']=pd.to_datetime(df['Order Date'])
C:\Users\user\AppData\Local\Temp\ipykernel_6768\1722919639.p
y:1: UserWarning: Could not infer format, so each element wil
l be parsed individually, falling back to `dateutil`. To ensu
re parsing is consistent and as-expected, please specify a fo
rmat.
df['Order Date']=pd.to_datetime(df['Order Date'])
In [146… x=month_sales.index
y=month_sales.values
explode=[0,0,0,0,0,0,0,0,0,0,0,0]
plt.pie(y,labels=x,explode=explode,shadow=True,autopct='%1.1
plt.title('MONTH WISE SALES')
plt.show()
In [ ]: x=month_sales.index
y=month_sales.values
plt.bar
In [24]: x=year_sales.index
y=year_sales.values
plt.pie(y,labels=x,shadow=True,autopct='%1.1f%%',colors=sns.
plt.title('YEAR WISE SALES')
plt.show()
In [4]: x=sales_by_product.index
y=sales_by_product.values
plt.figure(figsize=(15,6))
plt.bar(x,y,label=y,color=sns.color_palette('plasma'),width=
plt.legend(title='Sales',loc=0)
plt.title('sales by product',fontsize=20)
plt.grid(axis='y')
plt.xlabel('PRODUCT',fontsize=14)
plt.ylim(0,1750000)
plt.ylabel("SALES",fontsize=14)
plt.show()
In [8]: df.groupby('Customer Name')['Profit'].sum().sort_values(asce
In [9]: le=LabelEncoder()
In [22]: target=df['Sales']
In [27]: y_pred=lr.predict(x_test)
In [24]: x_train,x_test,y_train,y_test=train_test_split(features,targ
In [31]: mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)
In [25]: scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.fit_transform(x_test)
In [26]: lr=LinearRegression()
lr.fit(x_train,y_train)
Out[26]: ▾ LinearRegression i ?
LinearRegression()
In [32]: print(mse,r2)
2199.6894915749735 0.9933977767041029
In [11]: df.corr(numeric_only=True)
Out[11]: order order
Sales Discount Profit
day month
order
0.000179 0.003022 0.010742 1.000000 -0.033562 -0
day
order
-0.009518 0.002068 0.003184 -0.033562 1.000000 -0
month
order
0.007542 -0.018778 -0.006401 -0.017458 -0.020183 1
year
In [36]: plt.scatter(y_test,y_pred)
plt.plot([min(y_test), max(y_test)], [min(y_test),
max(y_test)], color='red')
In [8]: df.columns
In [ ]: