Data Visualization With Python - Matplotlib and Seaborn
# If you don't run this code line, you will notice that the xlabel and ylabel on any plot will be black on black and hard to see.
# Decorate the plot created in the preceding (not shown) cell line:
# y-axis label, title, and grid. The legend call is left disabled.
plt.ylabel('Price [$]')
plt.title('My first plotting exercise')
#plt.legend(loc = 'upper right')
plt.grid()
MINI CHALLENGE #1: - Plot a similar kind of graph for NFLX - Change the line color to red
and increase the line width
[ ]:
# Same decoration for the NFLX challenge plot (plot call not shown in this
# extract): y-axis label, title, grid; legend left disabled.
plt.ylabel('Price [$]')
plt.title('My first plotting exercise')
#plt.legend(loc = 'upper right')
plt.grid()
2
[6]: stock_df.plot(x = 'Date', y = 'TWTR', label = 'Twiter Stock Price', figsize=␣
↪(15, 10), linewidth = 5, color = 'green')
plt.ylabel('Price [$]')
plt.title('My first ploting exercice')
#plt.legend(loc = 'upper right')
plt.grid()
3
2 2. PLOT SCATTERPLOT
[7]: # Read daily return data using pandas
daily_return_df = pd.read_csv('stocks_daily_returns.csv')
# Bare expression: displays the DataFrame when run as the last line of a cell.
daily_return_df
4
[9]: X = daily_return_df['FB']
X
[9]: 0 0.000000
1 -0.063082
2 -2.798229
3 0.887446
4 4.505467
…
1707 2.444881
1708 -0.743467
1709 1.640390
1710 3.474701
1711 8.222348
Name: FB, Length: 1712, dtype: float64
[10]: Y = daily_return_df['TWTR']
Y
[10]: 0 0.000000
1 -7.238307
2 3.001200
3 -2.331002
4 1.670635
…
1707 0.179995
1708 0.770018
1709 3.132970
1710 0.148177
1711 1.307036
Name: TWTR, Length: 1712, dtype: float64
5
MINI CHALLENGE #2: - Plot a similar kind of graph for Facebook and Netflix
[8]: X = daily_return_df['FB']
X
[8]: 0 0.000000
1 -0.063082
2 -2.798229
3 0.887446
4 4.505467
…
1707 2.444881
1708 -0.743467
1709 1.640390
1710 3.474701
1711 8.222348
Name: FB, Length: 1712, dtype: float64
[9]: Y = daily_return_df['NFLX']
Y
[9]: 0 0.000000
1 2.459768
2 0.898778
6
3 -1.237020
4 0.464452
…
1707 2.759374
1708 -1.122715
1709 -0.710934
1710 0.362102
1711 11.608717
Name: NFLX, Length: 1712, dtype: float64
7
# Use matplotlib to plot a pie chart
plt.figure(figsize = (10, 10))
plt.pie(values, colors = colors, labels = labels, explode = explode)
plt.title('STOCK PORTFOLIO')
MINI CHALLENGE #3: - Plot the pie chart for the same stocks assuming equal allocation -
Explode Amazon and Google slices
[5]: values = [20, 20, 20, 20, 20]
colors = ['g', 'r', 'y', 'b', 'm']
labels =['AAPL', 'GOOG', 'T', 'TSLA', 'AMZN']
explode = [0, 0.2, 0, 0, 0.2]
8
plt.figure(figsize = (10, 10))
plt.pie(values, colors = colors, labels = labels, explode = explode)
plt.title('STOCK PORTFOLIO')
[ ]:
[ ]:
9
4 4. PLOT HISTOGRAMS
[14]: daily_return_df = pd.read_csv('stocks_daily_returns.csv')
daily_return_df
# NOTE(review): despite its name, `new_equals` holds the MEAN of the FB daily
# returns and `sigma` the standard deviation — presumably the mu/sigma pair for
# a normal-fit overlay on the histogram; confirm against the full notebook.
new_equals = daily_return_df['FB'].mean()
sigma = daily_return_df['FB'].std()
10
MINI CHALLENGE #4: - Plot the histogram for TWITTER returns using 30 bins
# MINI CHALLENGE #4: mean and standard deviation of Twitter daily returns.
# NOTE(review): `new_equals` is a misleading name — it holds the mean.
[19]: new_equals = daily_return_df['TWTR'].mean()
sigma = daily_return_df['TWTR'].std()
11
5 5. PLOT MULTIPLE PLOTS
[22]: stock_df
12
MINI CHALLENGE #5: - Plot a similar graph containing prices of Netflix, Twitter and Facebook
- Add legend indicating all the stocks - Place the legend in the “upper center” location
[35]: stock_df.plot(x = 'Date', y = ['NFLX', 'TWTR', 'FB'], figsize = (18, 10),␣
↪linewidth = 4)
plt.ylabel('Price [$]')
plt.title('Stock Prices')
plt.legend(loc = 'upper center')
plt.grid()
13
6 6. PLOT SUBPLOTS
[38]: plt.figure(figsize= (20, 10))
plt.subplot(1, 2, 1)
plt.plot(stock_df['NFLX'], color = 'red', linewidth = 4)
plt.grid()
plt.subplot(1, 2, 2)
plt.plot(stock_df['FB'], color = 'blue', linewidth = 4)
plt.grid()
14
[39]: plt.figure(figsize= (20, 10))
plt.subplot(2, 1, 1)
plt.plot(stock_df['NFLX'], color = 'red', linewidth = 4)
plt.grid()
plt.subplot(2, 1, 2)
plt.plot(stock_df['FB'], color = 'blue', linewidth = 4)
plt.grid()
15
MINI CHALLENGE #6: - Create subplots like above for Twitter, Facebook and Netflix
[42]: plt.figure(figsize= (17, 17))
plt.subplot(3, 1, 1)
plt.plot(stock_df['NFLX'], color = 'red', linewidth = 4)
plt.grid()
plt.subplot(3, 1, 2)
plt.plot(stock_df['FB'], color = 'blue', linewidth = 4)
plt.grid()
plt.subplot(3, 1, 3)
plt.plot(stock_df['TWTR'], color = 'green', linewidth = 4)
plt.grid()
16
7 7. PLOT 3D PLOTS
[43]: # Toolkits are collections of application-specific functions that extend Matplotlib.
17
# 3-D scatter of three hand-made series.
# NOTE(review): `ax` is not created in this fragment — it presumably comes from
# a 3-D axes (e.g. fig.add_subplot(projection='3d')) in a line not shown here;
# confirm against the full notebook.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = [5, 6, 2, 3, 13, 4, 1, 2, 4, 8]
z = [2, 3, 3, 3, 5, 7, 9, 11, 9, 10]
ax.scatter(x, y, z, c = 'b')  # 'b' = blue markers
ax.set_xlabel('X label')
ax.set_ylabel('Y label')
ax.set_zlabel('Z label')
MINI CHALLENGE #7: - Create a 3D plot with daily return values of Twitter, Facebook and Netflix
[47]: daily_return_df
# Series for the three axes of the 3-D daily-return plot.
x = daily_return_df['TWTR']
y = daily_return_df['FB']
z = daily_return_df['NFLX']
# NOTE(review): the 3-D axes creation and ax.scatter call are not visible in
# this extract — presumably in lines not shown; confirm.
19
8 8. SEABORN SCATTERPLOT & COUNTPLOT
[11]: # Seaborn is a visualization library that sits on top of matplotlib
# Seaborn offers enhanced features compared to matplotlib
# https://seaborn.pydata.org/examples/index.html
# import libraries
import seaborn as sns # Statistical data visualization
20
cancer
21
Attributes: 30 numeric, predictive attributes and the class\n\n :Attribute
Information:\n - radius (mean of distances from center to points on the
perimeter)\n - texture (standard deviation of gray-scale values)\n
- perimeter\n - area\n - smoothness (local variation in radius
lengths)\n - compactness (perimeter^2 / area - 1.0)\n - concavity
(severity of concave portions of the contour)\n - concave points (number
of concave portions of the contour)\n - symmetry\n - fractal
dimension ("coastline approximation" - 1)\n\n The mean, standard error,
and "worst" or largest (mean of the three\n worst/largest values) of
these features were computed for each image,\n resulting in 30 features.
For instance, field 0 is Mean Radius, field\n 10 is Radius SE, field 20
is Worst Radius.\n\n - class:\n - WDBC-Malignant\n
- WDBC-Benign\n\n :Summary Statistics:\n\n
===================================== ====== ======\n
Min Max\n ===================================== ====== ======\n radius
(mean): 6.981 28.11\n texture (mean):
9.71 39.28\n perimeter (mean): 43.79 188.5\n area
(mean): 143.5 2501.0\n smoothness (mean):
0.053 0.163\n compactness (mean): 0.019 0.345\n
concavity (mean): 0.0 0.427\n concave points (mean):
0.0 0.201\n symmetry (mean): 0.106 0.304\n
fractal dimension (mean): 0.05 0.097\n radius (standard error):
0.112 2.873\n texture (standard error): 0.36 4.885\n
perimeter (standard error): 0.757 21.98\n area (standard error):
6.802 542.2\n smoothness (standard error): 0.002 0.031\n
compactness (standard error): 0.002 0.135\n concavity (standard
error): 0.0 0.396\n concave points (standard error): 0.0
0.053\n symmetry (standard error): 0.008 0.079\n fractal
dimension (standard error): 0.001 0.03\n radius (worst):
7.93 36.04\n texture (worst): 12.02 49.54\n
perimeter (worst): 50.41 251.2\n area (worst):
185.2 4254.0\n smoothness (worst): 0.071 0.223\n
compactness (worst): 0.027 1.058\n concavity (worst):
0.0 1.252\n concave points (worst): 0.0 0.291\n
symmetry (worst): 0.156 0.664\n fractal dimension
(worst): 0.055 0.208\n =====================================
====== ======\n\n :Missing Attribute Values: None\n\n :Class Distribution:
212 - Malignant, 357 - Benign\n\n :Creator: Dr. William H. Wolberg, W. Nick
Street, Olvi L. Mangasarian\n\n :Donor: Nick Street\n\n :Date: November,
1995\n\nThis is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic)
datasets.\nhttps://goo.gl/U2Uwz2\n\nFeatures are computed from a digitized image
of a fine needle\naspirate (FNA) of a breast mass. They
describe\ncharacteristics of the cell nuclei present in the image.\n\nSeparating
plane described above was obtained using\nMultisurface Method-Tree (MSM-T) [K.
P. Bennett, "Decision Tree\nConstruction Via Linear Programming." Proceedings of
the 4th\nMidwest Artificial Intelligence and Cognitive Science Society,\npp.
97-101, 1992], a classification method which uses linear\nprogramming to
22
construct a decision tree. Relevant features\nwere selected using an exhaustive
search in the space of 1-4\nfeatures and 1-3 separating planes.\n\nThe actual
linear program used to obtain the separating plane\nin the 3-dimensional space
is that described in:\n[K. P. Bennett and O. L. Mangasarian: "Robust
Linear\nProgramming Discrimination of Two Linearly Inseparable
Sets",\nOptimization Methods and Software 1, 1992, 23-34].\n\nThis database is
also available through the UW CS ftp server:\n\nftp ftp.cs.wisc.edu\ncd math-
prog/cpo-dataset/machine-learn/WDBC/\n\n.. topic:: References\n\n - W.N.
Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n for
breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on \n
Electronic Imaging: Science and Technology, volume 1905, pages 861-870,\n
San Jose, CA, 1993.\n - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast
cancer diagnosis and \n prognosis via linear programming. Operations
Research, 43(4), pages 570-577, \n July-August 1995.\n - W.H. Wolberg,
W.N. Street, and O.L. Mangasarian. Machine learning techniques\n to diagnose
breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n
163-171.',
'feature_names': array(['mean radius', 'mean texture', 'mean perimeter', 'mean
area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension'], dtype='<U23'),
'filename': 'breast_cancer.csv',
'data_module': 'sklearn.datasets.data'}
[7]: mean radius mean texture mean perimeter mean area mean smoothness \
0 17.99 10.38 122.80 1001.0 0.11840
1 20.57 17.77 132.90 1326.0 0.08474
2 19.69 21.25 130.00 1203.0 0.10960
3 11.42 20.38 77.58 386.1 0.14250
4 20.29 14.34 135.10 1297.0 0.10030
.. … … … … …
564 21.56 22.39 142.00 1479.0 0.11100
565 20.13 28.25 131.20 1261.0 0.09780
23
566 16.60 28.08 108.30 858.1 0.08455
567 20.60 29.33 140.10 1265.0 0.11780
568 7.76 24.54 47.92 181.0 0.05263
24
3 0.2575 0.6638 0.17300 0.0
4 0.1625 0.2364 0.07678 0.0
.. … … … …
564 0.2216 0.2060 0.07115 0.0
565 0.1628 0.2572 0.06637 0.0
566 0.1418 0.2218 0.07820 0.0
567 0.2650 0.4087 0.12400 0.0
568 0.0000 0.2871 0.07039 1.0
[8]: mean radius mean texture mean perimeter mean area mean smoothness \
562 15.22 30.62 103.40 716.9 0.10480
563 20.92 25.09 143.00 1347.0 0.10990
564 21.56 22.39 142.00 1479.0 0.11100
565 20.13 28.25 131.20 1261.0 0.09780
566 16.60 28.08 108.30 858.1 0.08455
567 20.60 29.33 140.10 1265.0 0.11780
568 7.76 24.54 47.92 181.0 0.05263
25
567 0.16500 0.86810 0.9387
568 0.08996 0.06444 0.0000
[7 rows x 31 columns]
[12]: # Plot scatter plot between mean area and mean smoothness
plt.figure(figsize = (10,10))
sns.scatterplot(x = 'mean area', y = 'mean smoothness', hue= 'target', data =␣
↪df_cancer)
26
[20]: # Let's print out countplot to know how many samples belong to class #0 and #1
plt.figure(figsize = (10,10))
# One bar per target class; hue colours the bars by the same column.
sns.countplot(data = df_cancer, x= 'target', hue = 'target')
27
MINI CHALLENGE #8: - Plot the scatterplot between the mean radius and mean area. Comment
on the plot
[22]: sns.scatterplot(x = 'mean radius', y = 'mean area', hue = 'target', data =␣
↪df_cancer)
28
9 9. SEABORN PAIRPLOT, DISPLOT, AND
HEATMAPS/CORRELATIONS
[23]: # Plot the pairplot
sns.pairplot(df_cancer, hue = 'target', vars = ['mean radius', 'mean texture',␣
↪'mean area', 'mean perimeter', 'mean smoothness'])
29
[24]: # Strong correlation between the mean radius and mean perimeter, and between the mean area and mean perimeter
30
[25]: # plot the distplot
# Distplot combines the matplotlib histogram function with kdeplot() (Kernel density estimate)
31
MINI CHALLENGE #9: - Plot two separate distplot for each target class #0 and target class #1
# Split the samples by target class: 0 = malignant (212 rows) and
# 1 = benign (357 rows), per the dataset summary printed above.
[31]: class_0_df = df_cancer[ df_cancer['target'] == 0]
class_1_df = df_cancer[ df_cancer['target'] == 1]
# Bare expressions: in a notebook only the last one is displayed.
class_0_df
class_1_df
[31]: mean radius mean texture mean perimeter mean area mean smoothness \
19 13.540 14.36 87.46 566.3 0.09779
20 13.080 15.71 85.63 520.0 0.10750
21 9.504 12.44 60.34 273.9 0.10240
37 13.030 18.42 82.61 523.8 0.08983
46 8.196 16.84 51.71 201.9 0.08600
.. … … … … …
558 14.590 22.68 96.39 657.1 0.08473
559 11.510 23.93 74.52 403.5 0.09261
560 14.050 27.15 91.38 600.4 0.09929
561 11.200 29.37 70.67 386.0 0.07449
568 7.760 24.54 47.92 181.0 0.05263
32
mean compactness mean concavity mean concave points mean symmetry \
19 0.08129 0.06664 0.047810 0.1885
20 0.12700 0.04568 0.031100 0.1967
21 0.06492 0.02956 0.020760 0.1815
37 0.03766 0.02562 0.029230 0.1467
46 0.05943 0.01588 0.005917 0.1769
.. … … … …
558 0.13300 0.10290 0.037360 0.1454
559 0.10210 0.11120 0.041050 0.1388
560 0.11260 0.04462 0.043040 0.1537
561 0.03558 0.00000 0.000000 0.1060
568 0.04362 0.00000 0.000000 0.1587
33
558 0.11050 0.2258 0.08004 1.0
559 0.09653 0.2112 0.08732 1.0
560 0.10480 0.2250 0.08321 1.0
561 0.00000 0.1566 0.05905 1.0
568 0.00000 0.2871 0.07039 1.0
[32]: class_0_df
[32]: mean radius mean texture mean perimeter mean area mean smoothness \
0 17.99 10.38 122.80 1001.0 0.11840
1 20.57 17.77 132.90 1326.0 0.08474
2 19.69 21.25 130.00 1203.0 0.10960
3 11.42 20.38 77.58 386.1 0.14250
4 20.29 14.34 135.10 1297.0 0.10030
.. … … … … …
563 20.92 25.09 143.00 1347.0 0.10990
564 21.56 22.39 142.00 1479.0 0.11100
565 20.13 28.25 131.20 1261.0 0.09780
566 16.60 28.08 108.30 858.1 0.08455
567 20.60 29.33 140.10 1265.0 0.11780
34
567 0.07016 … 39.42 184.60 1821.0
[33]: class_1_df
[33]: mean radius mean texture mean perimeter mean area mean smoothness \
19 13.540 14.36 87.46 566.3 0.09779
20 13.080 15.71 85.63 520.0 0.10750
21 9.504 12.44 60.34 273.9 0.10240
37 13.030 18.42 82.61 523.8 0.08983
46 8.196 16.84 51.71 201.9 0.08600
.. … … … … …
558 14.590 22.68 96.39 657.1 0.08473
559 11.510 23.93 74.52 403.5 0.09261
560 14.050 27.15 91.38 600.4 0.09929
561 11.200 29.37 70.67 386.0 0.07449
568 7.760 24.54 47.92 181.0 0.05263
35
20 0.12700 0.04568 0.031100 0.1967
21 0.06492 0.02956 0.020760 0.1815
37 0.03766 0.02562 0.029230 0.1467
46 0.05943 0.01588 0.005917 0.1769
.. … … … …
558 0.13300 0.10290 0.037360 0.1454
559 0.10210 0.11120 0.041050 0.1388
560 0.11260 0.04462 0.043040 0.1537
561 0.03558 0.00000 0.000000 0.1060
568 0.04362 0.00000 0.000000 0.1587
36
561 0.00000 0.1566 0.05905 1.0
568 0.00000 0.2871 0.07039 1.0
10 EXCELLENT JOB!
# Challenge #1 solution decoration (the plot call itself is not shown in this
# extract): y-axis label, title, legend in the upper left, and grid.
plt.ylabel('Price')
plt.title('My first plotting exercise!')
plt.legend(loc = "upper left")
plt.grid()
MINI CHALLENGE #2 SOLUTIONS: - Plot similar kind of graph for Facebook and Netflix
# MINI CHALLENGE #2 solution: scatter of FB vs. NFLX daily returns.
[ ]: X = daily_return_df['FB']
Y = daily_return_df['NFLX']
plt.figure(figsize = (15, 10))
plt.grid()
plt.scatter(X, Y);
MINI CHALLENGE #3 SOLUTIONS: - Plot the pie chart for the same stocks assuming equal
allocation - Explode Amazon and Google slices
[ ]: values = [20, 20, 20, 20, 20]
colors = ['g', 'r', 'y', 'b', 'm']
explode = [0, 0.2, 0, 0, 0.2]
labels = ['AAPL', 'GOOG', 'T', 'TSLA ', 'AMZN']
plt.figure(figsize = (10, 10))
plt.pie(values, colors = colors, labels = labels, explode = explode)
plt.title('STOCK PORTFOLIO')
plt.show()
MINI CHALLENGE #4 SOLUTIONS: - Plot the histogram for TWITTER returns with 30 bins
# MINI CHALLENGE #4 solution: 30-bin histogram of Twitter daily returns.
[ ]: num_bins = 30
plt.figure(figsize = (10,7))
plt.hist(daily_return_df['TWTR'], num_bins, facecolor = 'blue');
plt.grid()
MINI CHALLENGE #5 SOLUTION: - Plot a similar graph containing prices of Netflix, Twitter
and Facebook - Add legend indicating all the stocks - Place the legend in the “upper center” location
[ ]: stock_df.plot(x = 'Date', y = ['FB', 'TWTR', 'NFLX'], figsize = (15, 10),␣
↪linewidth = 3)
plt.ylabel('Price')
plt.title('Stock Prices')
plt.legend(loc="upper center")
plt.grid()
MINI CHALLENGE #6 SOLUTION: - Create subplots like above for Twitter, Facebook and
Netflix
[ ]: plt.figure(figsize = (17,17))
plt.subplot(3, 1, 1)
plt.plot(stock_df.index, stock_df['FB'], 'r--');
plt.grid()
plt.legend(['Facebook price'])
38
plt.subplot(3, 1, 2)
plt.plot(stock_df.index, stock_df['TWTR'], 'b.');
plt.grid()
plt.legend(['Twitter price'])
plt.subplot(3, 1, 3)
plt.plot(stock_df.index, stock_df['NFLX'], 'y--');
plt.grid()
plt.legend(['Netflix price'])
MINI CHALLENGE #7 SOLUTION: - Create a 3D plot with daily return values of Twitter,
Facebook and Netflix
# MINI CHALLENGE #7 solution: daily-return series for the three axes.
[ ]: daily_return_df
x = daily_return_df['FB'].tolist()
y = daily_return_df['TWTR'].tolist()
z = daily_return_df['NFLX'].tolist()
# NOTE(review): `ax` (a 3-D axes) and the ax.scatter call are not visible in
# this extract — presumably created in lines not shown; confirm.
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
MINI CHALLENGE #8 SOLUTION: - Plot the scatterplot between the mean radius and mean
area. Comment on the plot
[ ]: sns.scatterplot(x = 'mean radius', y = 'mean area', hue = 'target', data =␣
↪df_cancer);
MINI CHALLENGE #9 SOLUTION: - Plot two separate distplot for each target class #0 and
target class #1
# MINI CHALLENGE #9 solution: split the samples by tumour class
# (0 = malignant, 1 = benign per the dataset summary above).
[ ]: class_0_df = df_cancer[ df_cancer['target'] == 0];
class_1_df = df_cancer[ df_cancer['target'] == 1];
class_0_df
class_1_df
39
plt.figure(figsize=(10, 7));
# Overlaid distributions of mean radius for the two classes.
# NOTE: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# the modern equivalent is sns.histplot(..., kde=True, stat='density').
sns.distplot(class_0_df['mean radius'], bins = 25, color = 'blue');
sns.distplot(class_1_df['mean radius'], bins = 25, color = 'red');
plt.grid();
12 APPENDIX
# np.c_ translates slice objects to concatenation along the second axis:
# two length-3 1-D arrays become the columns of a (3, 2) array.
x1 = np.array([1, 2, 3])
x1.shape
x2 = np.array([4, 5, 6])
x2.shape
z = np.c_[x1, x2]
print(z)
print(z.shape)
40