Altair Basic
Altair Basic
1 Cài đặt
[ ]: import warnings
warnings.filterwarnings('ignore')
# ẩn đi warnings
1
/usr/local/lib/python3.10/dist-packages (from jinja2->altair) (3.0.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-
packages (from python-dateutil>=2.8.2->pandas>=0.18->altair) (1.16.0)
#Basic Chart
[ ]: import pandas as pd
data = pd.DataFrame({'a': list('CCCDDDEEE'),
'b': [2, 7, 4, 1, 2, 6, 8, 4, 7]})
[ ]: data
[ ]: a b
0 C 2
1 C 7
2 C 4
3 D 1
4 D 2
5 D 6
6 E 8
7 E 4
8 E 7
[ ]: alt.Chart(data).mark_point()
[ ]: alt.Chart(…)
[ ]: alt.Chart(data).mark_point().encode(
x='a'
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(data).mark_point().encode(
x='a',
y='b'
)
[ ]: alt.Chart(…)
2
2 Data Transformation: Aggregation
[ ]: alt.Chart(data).mark_point().encode(
x='a',
y='average(b)' # y là trung bình của b
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(data).mark_bar().encode(
x='a',
y='average(b)'
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(data).mark_bar().encode(
y='a',
x='average(b)'
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(…)
4 Example
[ ]: !pip install vega_datasets
3
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-
packages (from pandas->vega_datasets) (2024.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-
packages (from python-dateutil>=2.8.2->pandas->vega_datasets) (1.16.0)
source = data.cars()
alt.Chart(source).mark_circle(size=60).encode(
x='Horsepower',
y='Miles_per_Gallon',
color='Origin',
tooltip=['Name', 'Origin', 'Horsepower', 'Miles_per_Gallon']
).interactive()
[ ]: alt.Chart(…)
[ ]: source.head()
[ ]: alt.Chart(source).mark_circle(size=60).encode(
x='Weight_in_lbs',
y='Miles_per_Gallon',
color='Cylinders',
tooltip=['Name','Weight_in_lbs', 'Miles_per_Gallon']
).interactive()
[ ]: alt.Chart(…)
4
[ ]: alt.Chart(source).mark_circle(size=60).encode(
x='Cylinders',
y='Miles_per_Gallon',
color = 'Origin',
tooltip=['Name','Miles_per_Gallon', 'Origin']
)
[ ]: alt.Chart(…)
source = pd.DataFrame({
'a': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
'b': [28, 55, 43, 91, 81, 53, 19, 87, 52]
})
alt.Chart(source).mark_bar().encode(
x='a', y='b'
).interactive()
[ ]: alt.Chart(…)
[ ]:
# Heatmap: one rectangle per (x, y) pair, coloured by z.
# The ':O' / ':Q' suffixes are Altair shorthand for ordinal / quantitative fields.
# NOTE(review): the cell that builds `source` with columns x, y, z is not visible here — confirm.
alt.Chart(source).mark_rect().encode(
    x='x:O',
    y='y:O',
    color='z:Q'
)
[ ]: alt.Chart(…)
[ ]: import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Compute x^2 + y^2 across a 2D grid
x, y = np.meshgrid(range(-50, 50), range(-50, 50))
z = x ** 2 + y ** 2
ax.plot_surface(x, y, z, cmap=plt.cm.YlGnBu_r)
plt.show()
# Altair accepts a dataset URL as the chart's data source.
source = data.movies.url

# Drawing mark_bar() (bar chart) with a binned x axis and y='count()' gives a histogram.
alt.Chart(source).mark_bar().encode(
    alt.X("IMDB_Rating:Q", bin=True),
    y='count()',
)
[ ]: alt.Chart(…)
[ ]: source
[ ]: 'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/movies.json'
df = pd.DataFrame(data)
df
[ ]: Diem Soluong
0 0 5
1 1 6
2 2 8
3 3 8
4 4 10
5 5 10
6 6 20
7 7 21
8 8 15
9 9 14
10 10 10
[ ]: alt.Chart(…)
7
x = np.arange(100)# 0--->99
source = pd.DataFrame({
'x': x,
'f(x)': np.sin(x / 5) #sin, cos, exp... -->numpy
})
alt.Chart(source).mark_line().encode(
x='x',
y='f(x)'
)
[ ]: alt.Chart(…)
x = range(0, 11)
y = np.sin(x)+np.cos(x)
df = pd.DataFrame({'x':x, 'f(x)':y})
df
alt.Chart(df).mark_line().encode(
x='x',
y='f(x)'
)
[ ]: alt.Chart(…)
[ ]: source.head()
[ ]: x f(x)
0 0 0.000000
1 1 0.198669
2 2 0.389418
3 3 0.564642
4 4 0.717356
# NOTE(review): `data` is presumably vega_datasets.data at this point — confirm the import.
source = data.iowa_electricity()

# mark_area(): area chart of net_generation over year, one colour per source.
alt.Chart(source).mark_area().encode(
    x="year:T",
    y="net_generation:Q",
    color="source:N"
)
[ ]: alt.Chart(…)
[ ]: source
9
36 2003-01-01 Renewables 1885
37 2004-01-01 Renewables 2102
38 2005-01-01 Renewables 2724
39 2006-01-01 Renewables 3364
40 2007-01-01 Renewables 3870
41 2008-01-01 Renewables 5070
42 2009-01-01 Renewables 8560
43 2010-01-01 Renewables 10308
44 2011-01-01 Renewables 11795
45 2012-01-01 Renewables 14949
46 2013-01-01 Renewables 16476
47 2014-01-01 Renewables 17452
48 2015-01-01 Renewables 19091
49 2016-01-01 Renewables 21241
50 2017-01-01 Renewables 21933
[ ]: data = {'Nam':[2020, 2021, 2022, 2023, 2024, 2020, 2021, 2022, 2023, 2024],
'Diem':[5, 6, 6, 8, 9, 5, 5, 7, 6, 5],
'Mon':['T', 'T', 'T', 'T', 'T', 'V', 'V', 'V', 'V', 'V']}
df = pd.DataFrame(data)
df
source = data.cars()
alt.Chart(source).mark_tick().encode(
x='Horsepower:Q',
y='Cylinders:O'
)
10
[ ]: alt.Chart(…)
5 IMDB data
[ ]: !pip install vega_datasets
import pandas as pd
import altair as alt
# `vega_data` is used below, so bind the vega_datasets loader to that name.
from vega_datasets import data as vega_data

# Read the movies JSON file, located by its dataset URL, into a DataFrame.
movies_df = pd.read_json(vega_data.movies.url)
[ ]: movies_df.head(5)
11
Production_Budget Release_Date MPAA_Rating Running_Time_min Distributor \
0 8000000.0 Jun 12 1998 R NaN Gramercy
1 300000.0 Aug 07 1998 R NaN Strand
2 250000.0 Aug 28 1998 None NaN Lionsgate
3 300000.0 Sep 11 1998 None NaN Fine Line
4 1000000.0 Oct 09 1998 R NaN Trimark
def extract_year(value):
    """Return the calendar year of a date string such as ``"Jun 12 1998"``.

    Args:
        value: Date text in ``'%b %d %Y'`` form (abbreviated month name,
            day of month, four-digit year).

    Returns:
        The ``year`` attribute of the timestamp parsed by ``pd.to_datetime``.
    """
    return pd.to_datetime(value, format='%b %d %Y').year


# Keep only the year of the Release_Date column by parsing each entry as a datetime.
movies_df["Year"] = movies_df["Release_Date"].apply(extract_year)
[ ]: movies_df.columns
[ ]: movies_df.head()
12
[ ]: Title US_Gross Worldwide_Gross US_DVD_Sales \
0 The Land Girls 146083.0 146083.0 NaN
1 First Love, Last Rites 10876.0 10876.0 NaN
2 I Married a Strange Person 203134.0 203134.0 NaN
3 Let's Talk About Sex 373615.0 373615.0 NaN
4 Slam 1009819.0 1087521.0 NaN
[ ]: movies_df["Year"].value_counts()
[ ]: Year
2006 220
2005 210
2002 208
2004 192
2000 188
…
1929 1
2020 1
1946 1
2043 1
1943 1
Name: count, Length: 91, dtype: int64
13
[ ]: (188, 17)
5.1 Chart
[ ]: alt.Chart(movies_2000).mark_point().encode(
alt.X('Production_Budget'),
alt.Y('Worldwide_Gross')
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(movies_2000).mark_point(filled=True).encode(
alt.X('Production_Budget'),
alt.Y('Worldwide_Gross'),
alt.Size('US_Gross')
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(movies_2000).mark_point(filled=True).encode(
alt.X('Production_Budget'),
alt.Y('Worldwide_Gross'),
alt.Size('US_Gross'),
alt.Color('Major_Genre'),
alt.OpacityValue(0.7)
)
[ ]: alt.Chart(…)
[ ]: alt.Chart(movies_2000).mark_point(filled=True).encode(
alt.X('Production_Budget'),
alt.Y('Worldwide_Gross'),
alt.Size('US_Gross'),
alt.Color('Major_Genre'),
alt.OpacityValue(0.7),
tooltip = [alt.Tooltip('Title'),
alt.Tooltip('Production_Budget'),
alt.Tooltip('Worldwide_Gross'),
alt.Tooltip('US_Gross')
]
)
[ ]: alt.Chart(…)
# Bubble chart of budget vs. worldwide gross: point size from US_Gross,
# colour from Major_Genre, fixed opacity 0.7, and a tooltip listing four fields.
# .interactive() is chained last to make the chart pannable/zoomable.
# NOTE(review): `movies_2000` is defined in a cell not visible here — confirm.
alt.Chart(movies_2000).mark_point(filled=True).encode(
    alt.X('Production_Budget'),
    alt.Y('Worldwide_Gross'),
    alt.Size('US_Gross'),
    alt.Color('Major_Genre'),
    alt.OpacityValue(0.7),
    tooltip=[
        alt.Tooltip('Title'),
        alt.Tooltip('Production_Budget'),
        alt.Tooltip('Worldwide_Gross'),
        alt.Tooltip('US_Gross'),
    ]
).interactive()
[ ]: alt.Chart(…)
[ ]: select_year = alt.selection_single(
name='Select', fields=['Year'], init={'Year': 1928},
bind=alt.binding_range(min=1928, max=2046, step=10)
)
alt.Chart(movies_df).mark_point(filled=True).encode(
alt.X('Production_Budget'),
alt.Y('Worldwide_Gross'),
alt.Size('US_Gross'),
alt.Color('Major_Genre'),
alt.OpacityValue(0.7),
tooltip = [alt.Tooltip('Title:N'),
alt.Tooltip('Production_Budget:Q'),
alt.Tooltip('Worldwide_Gross:Q'),
alt.Tooltip('US_Gross:Q')
]
).add_selection(select_year).transform_filter(select_year)
[ ]: alt.Chart(…)
# Single-value selection on the Year field, bound to a range slider
# (1968..2008, step 1) and initialised to 1968.
# NOTE(review): selection_single / init / add_selection are deprecated in
# Altair 5 in favour of selection_point / value / add_params — confirm the
# installed Altair version before migrating.
select_year = alt.selection_single(
    name='Select', fields=['Year'], init={'Year': 1968},
    bind=alt.binding_range(min=1968, max=2008, step=1)
)

# Same bubble chart as before, over the full movies_df; transform_filter
# keeps only the rows matching the year currently chosen on the slider.
alt.Chart(movies_df).mark_point(filled=True).encode(
    alt.X('Production_Budget'),
    alt.Y('Worldwide_Gross'),
    alt.Size('US_Gross'),
    alt.Color('Major_Genre'),
    alt.OpacityValue(0.7),
    tooltip=[
        alt.Tooltip('Title:N'),
        alt.Tooltip('Production_Budget:Q'),
        alt.Tooltip('Worldwide_Gross:Q'),
        alt.Tooltip('US_Gross:Q'),
    ]
).add_selection(select_year).transform_filter(select_year)
[ ]: alt.Chart(…)
16