Python Quick Note
Python Quick Note
In [2]: a.bit_length()
Out[2]: 4
In [3]: a = 100000
a.bit_length()
Out[3]: 17
In [6]: 1 + 4
Out[6]: 5
In [7]: 1 / 4
Out[7]: 0.25
In [8]: type(1 / 4)
Out[8]: float
Floats
In [9]: 1.6 / 4
Out[9]: 0.4
In [11]: b = 0.35
type(b)
Out[11]: float
In [12]: b + 0.1
Out[12]: 0.44999999999999996
In [13]: c = 0.5
c.as_integer_ratio()
Out[13]: (1, 2)
In [14]: b.as_integer_ratio()
Out[14]: (3152519739159347, 9007199254740992)
In [18]: decimal.getcontext().prec = 4
In [19]: e = Decimal(1) / Decimal (11)
e
Out[19]: Decimal('0.09091')
In [20]: decimal.getcontext().prec = 50
In [21]: f = Decimal(1) / Decimal (11)
f
Out[21]: Decimal('0.090909090909090909090909090909090909090909090909091')
In [22]: g = d + e + f
g
Out[22]: Decimal('0.27272818181818181818181818181909090909090909090909')
Booleans
In [23]: import keyword
In [24]: keyword.kwlist
Out[24]: ['False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await',
'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except',
'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda',
'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try', 'while',
'with', 'yield']
In [25]: 4 > 3
Out[25]: True
In [26]: type(4 > 3)
Out[26]: bool
In [27]: type(False)
Out[27]: bool
In [28]: 4 >= 3
Out[28]: True
In [29]: 4 < 3
Out[29]: False
In [30]: 4 <= 3
Out[30]: False
In [31]: 4 == 3
Out[31]: False
In [32]: 4 != 3
Out[32]: True
In [45]: if 4 > 3:
print('condition true')
condition true
In [46]: i = 0
while i < 4:
print('condition true, i = ', i)
i += 1
condition true, i = 0
condition true, i = 1
condition true, i = 2
condition true, i = 3
In [47]: int(True)
Out[47]: 1
In [48]: int(False)
Out[48]: 0
In [49]: float(True)
Out[49]: 1.0
In [50]: float(False)
Out[50]: 0.0
In [51]: bool(0)
Out[51]: False
In [52]: bool(0.0)
Out[52]: False
In [53]: bool(1)
Out[53]: True
In [54]: bool(10.5)
Out[54]: True
In [55]: bool(-2)
Out[55]: True
Strings
In [56]: t = 'this is a string object'
In [57]: t.capitalize()
Out[57]: 'This is a string object'
In [58]: t.split()
Out[58]: ['this', 'is', 'a', 'string', 'object']
In [59]: t.find('string')
Out[59]: 10
In [60]: t.find('Python')
Out[60]: -1
In [61]: t.replace(' ', '|')
Out[61]: 'this|is|a|string|object'
In [62]: 'http://www.python.org'.strip('http:/')
Out[62]: 'www.python.org'
In [87]: i = 0
while i < 4:
print('the number is %d' % i)
i += 1
the number is 0
the number is 1
the number is 2
the number is 3
In [88]: i = 0
while i < 4:
print('the number is {:d}'.format(i))
i += 1
the number is 0
the number is 1
the number is 2
the number is 3
In [90]: series = """ '01/18/2014 13:00:00', 100, '1st'; '01/18/2014 13:30:00', 110, '2nd';
'01/18/2014 14:00:00', 120, '3rd' """
In [91]: dt = re.compile("'[0-9/:\s]+'") # datetime
In [92]: result = dt.findall(series)
result
Out[92]: ["'01/18/2014 13:00:00'", "'01/18/2014 13:30:00'", "'01/18/2014 14:00:00'"]
In [93]: from datetime import datetime
pydt = datetime.strptime(result[0].replace("'", ""), '%m/%d/%Y %H:%M:%S')
pydt
Out[93]: datetime.datetime(2014, 1, 18, 13, 0)
In [94]: print(pydt)
2014-01-18 13:00:00
In [95]: print(type(pydt))
<class 'datetime.datetime'>
In [98]: t[2]
Out[98]: 'data'
In [99]: type(t[2])
Out[99]: str
In [100]: t.count('data')
Out[100]: 1
In [101]: t.index(1)
Out[101]: 0
Lists
In [102]: l = [1, 2.5, 'data']
l[2]
Out[102]: 'data'
In [103]: l = list(t)
l
Out[103]: [1, 2.5, 'data']
In [104]: type(l)
Out[104]: list
Control Structures
In [111]: for element in l[2:5]:
print(element ** 2)
6.25 1.0 2.25
In [112]: r = range(0, 8, 1)
r
Out[112]: range(0, 8)
In [113]: type(r)
Out[113]: range
In [114]: for i in range(2, 5):
print(l[i] ** 2)
6.25 1.0 2.25
In [115]: for i in range(1, 10):
if i % 2 == 0:
print("%d is even" % i)
elif i % 3 == 0:
print("%d is multiple of 3" % i)
else:
print("%d is odd" % i)
1 is odd 2 is even 3 is multiple of 3 4 is even 5 is odd 6 is even 7 is odd
8 is even 9 is multiple of 3
In [116]: total = 0
while total < 100:
total += 1
print(total)
100
In [117]: m = [i ** 2 for i in range(5)]
m
Out[117]: [0, 1, 4, 9, 16]
Functional Programming
In [118]: def f(x):
return x ** 2
f(2)
Out[118]: 4
In [119]: def even(x):
return x % 2 == 0
even(3)
Out[119]: False
In [120]: list(map(even, range(10)))
Out[120]: [True, False, True, False, True, False, True, False, True, False]
In [121]: list(map(lambda x: x ** 2, range(10)))
Out[121]: [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
In [122]: list(filter(even, range(15)))
Out[122]: [0, 2, 4, 6, 8, 10, 12, 14]
Dicts
In [123]: d = { 'Name' : 'Angela Merkel', 'Country' : 'Germany', 'Profession' :
'Chancelor', 'Age' : 64 }
type(d)
Out[123]: dict
In [124]: print(d['Name'], d['Age'])
Angela Merkel 64
In [125]: d.keys()
Out[125]: dict_keys(['Name', 'Country', 'Profession', 'Age'])
In [126]: d.values()
Out[126]: dict_values(['Angela Merkel', 'Germany', 'Chancelor', 64])
In [127]: d.items()
Out[127]: dict_items([('Name', 'Angela Merkel'), ('Country', 'Germany'), ('Profession',
'Chancelor'), ('Age', 64)])
In [128]: birthday = True
if birthday:
d['Age'] += 1
print(d['Age'])
65
In [129]: for item in d.items():
print(item)
('Name', 'Angela Merkel') ('Country', 'Germany') ('Profession', 'Chancelor')
('Age', 65)
In [130]: for value in d.values():
print(type(value))
<class 'str'> <class 'str'> <class 'str'> <class 'int'>
Method Arguments Returns/result
d[k] [k] Item of d with key k
d[k] = x [k] Sets item key k to x
del d[k] [k] Deletes item with key k
clear () Removes all items
copy () Makes a copy
k in d [k] True if k is a key (has_key(k) existed only in Python 2)
items () Iterator over all items
keys () Iterator over all keys
values () Iterator over all values
pop (k) Returns and removes item with key k (popitem() takes no key and removes the last-inserted item)
update ([e]) Updates items with items from e
Sets
In [131]: s = set(['u', 'd', 'ud', 'du', 'd', 'du'])
s
Out[131]: {'d', 'du', 'u', 'ud'}
In [132]: t = set(['d', 'dd', 'uu', 'u'])
In [133]: s.union(t)
Out[133]: {'d', 'dd', 'du', 'u', 'ud', 'uu'}
In [134]: s.intersection(t)
Out[134]: {'d', 'u'}
In [135]: s.difference(t)
Out[135]: {'du', 'ud'}
In [136]: t.difference(s)
Out[136]: {'dd', 'uu'}
In [137]: s.symmetric_difference(t)
Out[137]: {'dd', 'du', 'ud', 'uu'}
In [138]: from random import randint
l = [randint(0, 10) for i in range(1000)]
len(l)
Out[138]: 1000
In [139]: l[:20]
Out[139]: [4, 2, 10, 2, 1, 10, 0, 6, 0, 8, 10, 9, 2, 4, 7, 8, 10, 8, 8, 2]
In [140]: s = set(l)
s
Out[140]: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
In [68]: g.size
Out[68]: 12
In [69]: g.itemsize
Out[69]: 8
In [70]: g.ndim
Out[70]: 1
In [71]: g.shape
Out[71]: (12,)
In [72]: g.dtype
Out[72]: dtype('float64')
In [73]: g.nbytes
Out[73]: 96
In [74]: g = np.arange(15)
In [75]: g
Out[75]: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
In [76]: g.shape
Out[76]: (15,)
In [77]: np.shape(g)
Out[77]: (15,)
In [78]: g.reshape((3, 5))
Out[78]: array([[ 0, 1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, 13, 14]])
In [79]: h = g.reshape((5, 3))
h
Out[79]: array([[ 0, 1, 2], [ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11], [12, 13,
14]])
In [80]: h.T
Out[80]: array([[ 0, 3, 6, 9, 12], [ 1, 4, 7, 10, 13], [ 2, 5, 8, 11, 14]])
In [81]: h.transpose()
Out[81]: array([[ 0, 3, 6, 9, 12], [ 1, 4, 7, 10, 13], [ 2, 5, 8, 11, 14]])
In [82]: g
Out[82]: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
In [83]: np.resize(g, (3, 1))
Out[83]: array([[0], [1], [2]])
In [84]: np.resize(g, (1, 5))
Out[84]: array([[0, 1, 2, 3, 4]])
In [85]: np.resize(g, (2, 5))
Out[85]: array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
In [86]: n = np.resize(g, (5, 4))
n
Out[86]: array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11], [12, 13, 14, 0],
[ 1, 2, 3, 4]])
In [87]: h
Out[87]: array([[ 0, 1, 2], [ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11], [12, 13,
14]])
In [88]: np.hstack((h, 2 * h))
Out[88]: array([[ 0, 1, 2, 0, 2, 4], [ 3, 4, 5, 6, 8, 10], [ 6, 7, 8, 12, 14, 16],
[ 9, 10, 11, 18, 20, 22], [12, 13, 14, 24, 26, 28]])
In [89]: np.vstack((h, 0.5 * h))
Out[89]: array([[ 0. , 1. , 2. ], [ 3. , 4. , 5. ], [ 6. , 7. , 8. ], [ 9. , 10. , 11. ],
[12. , 13. , 14. ], [ 0. , 0.5, 1. ], [ 1.5, 2. , 2.5], [ 3. , 3.5, 4. ], [ 4.5,
5. , 5.5], [ 6. , 6.5, 7. ]])
In [90]: h
Out[90]: array([[ 0, 1, 2], [ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11], [12, 13,
14]])
In [91]: h.flatten()
Out[91]: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
In [92]: h.flatten(order='C')
Out[92]: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
In [93]: h.flatten(order='F')
Out[93]: array([ 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14])
In [94]: for i in h.flat:
print(i, end=',')
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
In [95]: for i in h.ravel(order='C'):
print(i, end=',')
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
In [96]: for i in h.ravel(order='F'):
print(i, end=',')
0,3,6,9,12,1,4,7,10,13,2,5,8,11,14,
Boolean Arrays
In [97]: h
Out[97]: array([[ 0, 1, 2], [ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11], [12, 13,
14]])
In [98]: h > 8
Out[98]: array([[False, False, False], [False, False, False], [False, False, False],
[ True, True, True], [ True, True, True]])
In [99]: h <= 7
Out[99]: array([[ True, True, True], [ True, True, True], [ True, True, False],
[False, False, False], [False, False, False]])
In [100]: h == 5
Out[100]: array([[False, False, False], [False, False, True], [False, False, False],
[False, False, False], [False, False, False]])
In [101]: (h == 5).astype(int)
Out[101]: array([[0, 0, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]])
In [102]: (h > 4) & (h <= 12)
Out[102]: array([[False, False, False], [False, False, True], [ True, True, True],
[ True, True, True], [ True, False, False]])
In [103]: h[h > 8]
Out[103]: array([ 9, 10, 11, 12, 13, 14])
In [104]: h[(h > 4) & (h <= 12)]
Out[104]: array([ 5, 6, 7, 8, 9, 10, 11, 12])
In [105]: h[(h < 4) | (h >= 12)]
Out[105]: array([ 0, 1, 2, 3, 12, 13, 14])
In [106]: np.where(h > 7, 1, 0)
Out[106]: array([[0, 0, 0], [0, 0, 0], [0, 0, 1], [1, 1, 1], [1, 1, 1]])
In [107]: np.where(h % 2 == 0, 'even', 'odd')
Out[107]: array([['even', 'odd', 'even'], ['odd', 'even', 'odd'], ['even', 'odd', 'even'],
['odd', 'even', 'odd'], ['even', 'odd', 'even']], dtype='<U4')
In [108]: np.where(h <= 7, h * 2, h / 2)
Out[108]: array([[ 0. , 2. , 4. ], [ 6. , 8. , 10. ], [12. , 14. , 4. ], [ 4.5, 5. ,
5.5], [ 6. , 6.5, 7. ]])
Speed Comparison
In [109]: import random
I = 5000
In [110]: %time mat = [[random.gauss(0, 1) for j in range(I)] \
for i in range(I)]
CPU times: user 17.1 s, sys: 361 ms, total: 17.4 s
Wall time: 17.4 s
In [111]: mat[0][:5]
Out[111]: [-0.40594967782329183, -1.357757478015285, 0.05129566894355976,
-0.8958429976582192, 0.6234174778878331]
In [112]: %time sum([sum(l) for l in mat])
CPU times: user 142 ms, sys: 1.69 ms, total: 144 ms
Wall time: 143 ms
Out[112]: -3561.944965714259
In [113]: import sys
sum([sys.getsizeof(l) for l in mat])
Out[113]: 215200000
In [114]: %time mat = np.random.standard_normal((I, I))
CPU times: user 1.01 s, sys: 200 ms, total: 1.21 s
Wall time: 1.21 s
In [115]: %time mat.sum()
CPU times: user 29.7 ms, sys: 1.15 ms, total: 30.8 ms
Wall time: 29.4 ms
Out[115]: -186.12767026606448
In [116]: mat.nbytes
Out[116]: 200000000
In [117]: sys.getsizeof(mat)
Out[117]: 200000112
Structured NumPy Arrays
In [118]: dt = np.dtype([('Name', 'S10'), ('Age', 'i4'), ('Height', 'f'), ('Children/Pets', 'i4',
2)])
In [119]: dt
Out[119]: dtype([('Name', 'S10'), ('Age', '<i4'), ('Height', '<f4'), ('Children/Pets', '<i4', (2,))])
In [120]: dt = np.dtype({'names': ['Name', 'Age', 'Height', 'Children/Pets'],
'formats': 'O int float int,int'.split()})
In [121]: dt
Out[121]: dtype([('Name', 'O'), ('Age', '<i8'), ('Height', '<f8'), ('Children/Pets', [('f0', '<i8'), ('f1',
'<i8')])])
In [122]: s = np.array([('Smith', 45, 1.83, (0, 1)), ('Jones', 53, 1.72, (2, 2))], dtype=dt)
In [123]: s
Out[123]: array([('Smith', 45, 1.83, (0, 1)), ('Jones', 53, 1.72, (2, 2))], dtype=[('Name', 'O'), ('Age',
'<i8'), ('Height', '<f8'), ('Children/Pets', [('f0', '<i8'), ('f1', '<i8')])])
In [124]: type(s)
Out[124]: numpy.ndarray
In [125]: s['Name']
Out[125]: array(['Smith', 'Jones'], dtype=object)
In [126]: s['Height'].mean()
Out[126]: 1.775
In [127]: s[0]
Out[127]: ('Smith', 45, 1.83, (0, 1))
In [128]: s[1]['Age']
Out[128]: 53
Basic Vectorization
In [129]: np.random.seed(100)
r = np.arange(12).reshape((4, 3))
s = np.arange(12).reshape((4, 3)) * 0.5
In [130]: r
Out[130]: array([[ 0, 1, 2], [ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11]])
In [131]: s
Out[131]: array([[0. , 0.5, 1. ], [1.5, 2. , 2.5], [3. , 3.5, 4. ], [4.5, 5. , 5.5]])
In [132]: r + s
Out[132]: array([[ 0. , 1.5, 3. ], [ 4.5, 6. , 7.5], [ 9. , 10.5, 12. ], [13.5, 15. ,
16.5]])
In [133]: r + 3
Out[133]: array([[ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11], [12, 13, 14]])
In [134]: 2 * r
Out[134]: array([[ 0, 2, 4], [ 6, 8, 10], [12, 14, 16], [18, 20, 22]])
In [135]: 2 * r + 3
Out[135]: array([[ 3, 5, 7], [ 9, 11, 13], [15, 17, 19], [21, 23, 25]])
In [136]: r
Out[136]: array([[ 0, 1, 2], [ 3, 4, 5], [ 6, 7, 8], [ 9, 10, 11]])
In [137]: r.shape
Out[137]: (4, 3)
In [138]: s = np.arange(0, 12, 4)
s
Out[138]: array([0, 4, 8])
In [139]: r + s
Out[139]: array([[ 0, 5, 10], [ 3, 8, 13], [ 6, 11, 16], [ 9, 14, 19]])
In [140]: s = np.arange(0, 12, 3)
s
Out[140]: array([0, 3, 6, 9])
In [141]: r + s
ValueError: operands could not be broadcast together with shapes (4,3) (4,)
In [142]: r.transpose() + s
Out[142]: array([[ 0, 6, 12, 18], [ 1, 7, 13, 19], [ 2, 8, 14, 20]])
In [143]: sr = s.reshape(-1, 1)
sr
Out[143]: array([[0], [3], [6], [9]])
In [144]: sr.shape
Out[144]: (4, 1)
In [145]: r + s.reshape(-1, 1)
Out[145]: array([[ 0, 1, 2], [ 6, 7, 8], [12, 13, 14], [18, 19, 20]])
In [146]: def f(x):
return 3 * x + 5
In [147]: f(0.5)
Out[147]: 6.5
In [148]: f(r)
Out[148]: array([[ 5, 8, 11], [14, 17, 20], [23, 26, 29], [32, 35, 38]])
Memory Layout
In [149]: x = np.random.standard_normal((1000000, 5))
In [150]: y = 2 * x + 3
In [151]: C = np.array((x, y), order='C')
In [152]: F = np.array((x, y), order='F')
In [153]: x = 0.0; y = 0.0
In [154]: C[:2].round(2)
Out[154]: array([[[-1.75, 0.34, 1.15, -0.25, 0.98], [ 0.51, 0.22, -1.07, -0.19, 0.26],
[-0.46, 0.44, -0.58, 0.82, 0.67], ..., [-0.05, 0.14, 0.17, 0.33, 1.39],
[ 1.02, 0.3 , -1.23, -0.68, -0.87], [ 0.83, -0.73, 1.03, 0.34, -0.46]], [[-0.5 , 3.69,
5.31, 2.5 , 4.96], [ 4.03, 3.44, 0.86, 2.62, 3.51], [ 2.08, 3.87, 1.83, 4.63, 4.35],
..., [ 2.9 , 3.28, 3.33, 3.67, 5.78], [ 5.04, 3.6 , 0.54, 1.65, 1.26], [ 4.67,
1.54, 5.06, 3.69, 2.07]]])
In [155]: %timeit C.sum()
4.36 ms ± 89.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [156]: %timeit F.sum()
4.21 ms ± 71.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [157]: %timeit C.sum(axis=0)
17.9 ms ± 776 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [158]: %timeit C.sum(axis=1)
35.1 ms ± 999 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [159]: %timeit F.sum(axis=0)
83.8 ms ± 2.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [160]: %timeit F.sum(axis=1)
67.9 ms ± 5.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [161]: F = 0.0; C = 0.0
Data Analysis with pandas
Object type Meaning Used for
DataFrame 2-dimensional data object with index Tabular data organized in
columns
Series 1-dimensional data object with index Single (time) series of data
DataFrame Class
In [1]: import pandas as pd
In [2]: df = pd.DataFrame([10, 20, 30, 40], columns=['numbers'],
index=['a', 'b', 'c', 'd'])
In [3]: df
Out[3]: numbers a 10 b 20 c 30 d 40
In [4]: df.index
Out[4]: Index(['a', 'b', 'c', 'd'], dtype='object')
In [5]: df.columns
Out[5]: Index(['numbers'], dtype='object')
In [6]: df.loc['c']
Out[6]: numbers 30 Name: c, dtype: int64
In [7]: df.loc[['a', 'd']]
Out[7]: numbers a 10 d 40
In [8]: df.iloc[1:3]
Out[8]: numbers b 20 c 30
In [9]: df.sum()
Out[9]: numbers 100 dtype: int64
In [10]: df.apply(lambda x: x ** 2)
Out[10]: numbers a 100 b 400 c 900 d 1600
In [11]: df ** 2
Out[11]: numbers a 100 b 400 c 900 d 1600
In [12]: df['floats'] = (1.5, 2.5, 3.5, 4.5)
In [13]: df
Out[13]: numbers floats a 10 1.5 b 20 2.5 c 30 3.5 d 40 4.5
In [14]: df['floats']
Out[14]: a 1.5 b 2.5 c 3.5 d 4.5 Name: floats, dtype: float64
In [15]: df['names'] = pd.DataFrame(['Yves', 'Sandra', 'Lilli', 'Henry'], index=['d', 'a',
'b', 'c'])
In [16]: df
Out[16]: numbers floats names a 10 1.5 Sandra b 20 2.5 Lilli c 30
3.5 Henry d 40 4.5 Yves
In [17]: df.append({'numbers': 100, 'floats': 5.75, 'names': 'Jil'}, ignore_index=True)
Out[17]: numbers floats names 0 10 1.50 Sandra 1 20 2.50 Lilli 2 30
3.50 Henry 3 40 4.50 Yves 4 100 5.75 Jil
In [18]: df = df.append(pd.DataFrame({'numbers': 100, 'floats': 5.75, 'names':
'Jil'}, index=['y',]))
In [19]: df
Out[19]: numbers floats names a 10 1.50 Sandra b 20 2.50 Lilli c 30
3.50 Henry d 40 4.50 Yves y 100 5.75 Jil
In [20]: df = df.append(pd.DataFrame({'names': 'Liz'}, index=['z',]), sort=False)
In [21]: df
Out[21]: numbers floats names a 10.0 1.50 Sandra b 20.0 2.50 Lilli c 30.0
3.50 Henry d 40.0 4.50 Yves y 100.0 5.75 Jil z NaN NaN Liz
In [22]: df.dtypes
Out[22]: numbers float64 floats float64 names object dtype: object
In [23]: df[['numbers', 'floats']].mean()
Out[23]: numbers 40.00 floats 3.55 dtype: float64
In [24]: df[['numbers', 'floats']].std()
Out[24]: numbers 35.355339 floats 1.662077 dtype: float64
In [25]: import numpy as np
In [26]: np.random.seed(100)
In [27]: a = np.random.standard_normal((9, 4))
In [28]: a
Out[28]: array([[-1.74976547, 0.3426804 , 1.1530358 , -0.25243604], [ 0.98132079,
0.51421884, 0.22117967, -1.07004333], [-0.18949583, 0.25500144, -0.45802699,
0.43516349], [-0.58359505, 0.81684707, 0.67272081, -0.10441114], [-0.53128038,
1.02973269, -0.43813562, -1.11831825], [ 1.61898166, 1.54160517, -0.25187914,
-0.84243574], [ 0.18451869, 0.9370822 , 0.73100034, 1.36155613], [-0.32623806,
0.05567601, 0.22239961, -1.443217 ], [-0.75635231, 0.81645401, 0.75044476,
-0.45594693]])
In [29]: df = pd.DataFrame(a)
Parameters of DataFrame() function
Parameter Format Description
data ndarray/dict/DataFrame Data for DataFrame; dict can contain Series,
ndarray, list
index Index/array-like Index to use; defaults to range(n)
columns Index/array-like Column headers to use; defaults to range(n)
dtype dtype, default None Data type to use/force; otherwise, it is inferred
copy bool, default None Copy data from inputs
In [31]: df.columns = ['No1', 'No2', 'No3', 'No4']
In [33]: df['No2'].mean()
Out[33]: 0.7010330941456459
In [34]: dates = pd.date_range('2019-1-1', periods=9, freq='M')
In [35]: dates
Out[35]: DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
'2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31', '2019-09-30'],
dtype='datetime64[ns]', freq='M')
Parameters of date_range() function
In [56]: type(df)
Out[56]: pandas.core.frame.DataFrame
In [57]: S = pd.Series(np.linspace(0, 15, 7), name='series')
In [58]: S
Out[58]: 0 0.0 1 2.5 2 5.0 3 7.5 4 10.0 5 12.5 6 15.0
Name: series, dtype: float64
In [59]: type(S)
Out[59]: pandas.core.series.Series
In [60]: s = df['No1']
In [61]: s
Out[61]: 2019-01-31 -1.749765 2019-02-28 0.981321 2019-03-31 -0.189496 2019-
04-30 -0.583595 2019-05-31 -0.531280 2019-06-30 1.618982 2019-07-31
0.184519 2019-08-31 -0.326238 2019-09-30 -0.756352 Freq: M, Name: No1, dtype:
float64
In [62]: type(s)
Out[62]: pandas.core.series.Series
In [63]: s.mean()
Out[63]: -0.15021177307319458
In [65]: df['Quarter'] = ['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3']
In [66]: groups = df.groupby('Quarter')
In [67]: groups.size()
Out[67]: Quarter Q1 3 Q2 3 Q3 3 dtype: int64
In [68]: groups.mean()
Out[68]: No1 No2 No3 No4 Quarter Q1 -0.319314 0.370634 0.305396
-0.295772 Q2 0.168035 1.129395 -0.005765 -0.688388 Q3 -0.299357 0.603071
0.567948 -0.179203
In [69]: groups.max()
Out[69]: No1 No2 No3 No4 Quarter Q1 0.981321 0.514219 1.153036
0.435163 Q2 1.618982 1.541605 0.672721 -0.104411 Q3 0.184519 0.937082
0.750445 1.361556
In [70]: groups.aggregate([min, max]).round(2)
Out[70]: No1 No2 No3 No4 min max min max min max min max
Quarter Q1 -1.75 0.98 0.26 0.51 -0.46 1.15 -1.07 0.44 Q2 -0.58 1.62 0.82 1.54
-0.44 0.67 -1.12 -0.10 Q3 -0.76 0.18 0.06 0.94 0.22 0.75 -1.44 1.36
In [71]: df['Odd_Even'] = ['Odd', 'Even', 'Odd', 'Even', 'Odd', 'Even', 'Odd', 'Even', 'Odd']
In [72]: groups = df.groupby(['Quarter', 'Odd_Even'])
In [73]: groups.size()
Out[73]: Quarter Odd_Even Q1 Even 1 Odd 2 Q2 Even 2
Odd 1 Q3 Even 1 Odd 2 dtype: int64
In [74]: groups[['No1', 'No4']].aggregate([sum, np.mean])
Complex Selection
In [90]: df1 = pd.DataFrame(['100', '200', '300', '400'], index=['a', 'b', 'c', 'd'],
columns=['A',])
In [91]: df1
Out[91]: A a 100 b 200 c 300 d 400
In [92]: df2 = pd.DataFrame(['200', '150', '50'], index=['f', 'b', 'd'],
columns=['B',])
In [93]: df2
Out[93]: B f 200 b 150 d 50
In [94]: df1.append(df2, sort=False)
Out[94]: A B a 100 NaN b 200 NaN c 300 NaN d 400 NaN f NaN 200
b NaN 150 d NaN 50
In [95]: df1.append(df2, ignore_index=True, sort=False)
Out[95]: A B 0 100 NaN 1 200 NaN 2 300 NaN 3 400 NaN 4 NaN 200
5 NaN 150 6 NaN 50
In [96]: pd.concat((df1, df2), sort=False)
Out[96]: A B a 100 NaN b 200 NaN c 300 NaN d 400 NaN f NaN 200
b NaN 150 d NaN 50
In [97]: pd.concat((df1, df2), ignore_index=True, sort=False)
Out[97]: A B 0 100 NaN 1 200 NaN 2 300 NaN 3 400 NaN 4 NaN 200
5 NaN 150 6 NaN 50
In [98]: df1.join(df2)
Out[98]: A B a 100 NaN b 200 150 c 300 NaN d 400 50
In [99]: df2.join(df1)
Out[99]: B A f 200 NaN b 150 200 d 50 400
In [100]: df1.join(df2, how='left')
In [101]: df1.join(df2, how='right')
In [102]: df1.join(df2, how='inner')
In [103]: df1.join(df2, how='outer')
In [104]: df = pd.DataFrame()
In [105]: df['A'] = df1['A']
In [106]: df
Out[106]: A a 100 b 200 c 300 d 400
In [107]: df['B'] = df2
In [108]: df
Out[108]: A B a 100 NaN b 200 150 c 300 NaN d 400 50
In [109]: df = pd.DataFrame({'A': df1['A'], 'B': df2['B']})
In [110]: df
Out[110]: A B a 100 NaN b 200 150 c 300 NaN d 400 50 f NaN
200
In [111]: c = pd.Series([250, 150, 50], index=['b', 'd', 'c'])
df1['C'] = c
df2['C'] = c
In [112]: df1
Out[112]: A C a 100 NaN b 200 250.0 c 300 50.0 d 400 150.0
In [113]: df2
Out[113]: B C f 200 NaN b 150 250.0 d 50 150.0
In [114]: pd.merge(df1, df2)
Out[114]: A C B 0 100 NaN 200 1 200 250.0 150 2 400 150.0 50
In [115]: pd.merge(df1, df2, on='C')
Out[115]: A C B 0 100 NaN 200 1 200 250.0 150 2 400 150.0 50
In [116]: pd.merge(df1, df2, how='outer')
Out[116]: A C B 0 100 NaN 200 1 200 250.0 150 2 300 50.0 NaN 3
400 150.0 50
In [117]: pd.merge(df1, df2, left_on='A', right_on='B')
In [118]: pd.merge(df1, df2, left_on='A', right_on='B', how='outer')
In [119]: pd.merge(df1, df2, left_index=True, right_index=True)
In [120]: pd.merge(df1, df2, on='C', left_index=True)
In [121]: pd.merge(df1, df2, on='C', right_index=True)
In [122]: pd.merge(df1, df2, on='C', left_index=True, right_index=True)
In [123]: data = np.random.standard_normal((1000000, 2))
In [124]: data.nbytes
Out[124]: 16000000
In [125]: df = pd.DataFrame(data, columns=['x', 'y'])
In [126]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
x 1000000 non-null float64
y 1000000 non-null float64
dtypes: float64(2)
memory usage: 15.3 MB
Classes
int
In [7]: n = 5
In [8]: type(n)
Out[8]: int
In [9]: n.numerator
Out[9]: 5
In [10]: n.bit_length()
Out[10]: 3
In [11]: n + n
Out[11]: 10
In [12]: 2 * n
Out[12]: 10
In [13]: n.__sizeof__()
Out[13]: 28
Class Example:
class Vector:
    """Three-component vector supporting basic arithmetic and sequence access.

    The components are stored as the attributes ``x``, ``y`` and ``z``.
    Instances can be added to each other, multiplied by a scalar (from
    either side), measured with ``abs()``, indexed with 0..2 / -3..-1
    and iterated over.
    """

    def __init__(self, x=0, y=0, z=0):
        """Store the three components; each defaults to 0."""
        self.x = x
        self.y = y
        self.z = z

    def __repr__(self):
        """Return ``'Vector(x, y, z)'`` built from the components' reprs."""
        return 'Vector(%r, %r, %r)' % (self.x, self.y, self.z)

    def __abs__(self):
        """Return the Euclidean length (square root of the sum of squares)."""
        return (self.x ** 2 + self.y ** 2 + self.z ** 2) ** 0.5

    def __bool__(self):
        """Return the truth value of the vector's length (False when it is 0)."""
        return bool(abs(self))

    def __add__(self, other):
        """Return a new Vector holding the component-wise sum.

        ``other`` only needs to expose ``x``, ``y`` and ``z`` attributes.
        """
        x = self.x + other.x
        y = self.y + other.y
        z = self.z + other.z
        return Vector(x, y, z)

    def __mul__(self, scalar):
        """Return a new Vector with every component multiplied by ``scalar``."""
        return Vector(self.x * scalar, self.y * scalar, self.z * scalar)

    def __rmul__(self, scalar):
        """Support ``scalar * vector`` by delegating to ``vector * scalar``.

        Without this hook only ``vector * scalar`` works and the reversed
        operand order raises ``TypeError``.
        """
        return self * scalar

    def __len__(self):
        """Return the number of components, which is always 3."""
        return 3

    def __getitem__(self, i):
        """Return the component at position ``i``.

        Positions 0, 1, 2 (or -3, -2, -1) map to ``x``, ``y``, ``z``.

        Raises:
            IndexError: if ``i`` is none of the six accepted positions.
        """
        if i in [0, -3]:
            return self.x
        elif i in [1, -2]:
            return self.y
        elif i in [2, -1]:
            return self.z
        else:
            raise IndexError('Index out of range.')

    def __iter__(self):
        """Yield the components in order x, y, z via ``__getitem__``."""
        for i in range(len(self)):
            yield self[i]