Only Pandas — these notes cover the Pandas library exclusively.
# Load the Gapminder dataset (tab-separated values) into a DataFrame.
# NOTE(review): assumes `import pandas as pd` earlier in the notes.
df = pd.read_csv('../data/gapminder.tsv', sep='\t')
print(type(df))     # <class 'pandas.core.frame.DataFrame'>
print(df.shape)     # (number of rows, number of columns)
print(df.columns)   # the column labels
print(df.dtypes)    # per-column data types
print(df.info())    # summary: dtypes, non-null counts, memory usage
Pandas Type    Python Type    Description
object         string         Most common data type
int64          int            Whole numbers
float64        float          Numbers with decimals
datetime64     datetime       datetime is found in the Python standard library
# Selecting a single column with [] yields a Series.
country_df = df['country']
print(country_df.head())
print(country_df.tail())
# Selecting with a list of column names yields a DataFrame subset.
subset = df[['country', 'continent', 'year']]
print(subset.head())
Subset method  Description
loc            Subset based on index label (row name)
iloc           Subset based on row index (row number)
ix             Subset based on index label or row index
               (deprecated since Pandas v0.20 and removed in v1.0)
print(df.loc[0])    # get the first row
print(df.loc[99])   # get the 100th row
# select the first, 100th, and 1000th rows
# FIX: the original comment had no matching code; .loc with a list of labels
# selects exactly those rows.
print(df.loc[[0, 99, 999]])
print(df.head(n=10))    # first ten rows
# mean life expectancy per year
print(df.groupby('year')['lifeExp'].mean())
# NOTE(review): multi_group_var is not defined in this excerpt — presumably
# the result of a multi-key groupby; reset_index() flattens the grouped
# index back into ordinary columns.
flat = multi_group_var.reset_index()
# nunique() calculates the number of unique values in a Series
# (FIX: this comment previously spilled onto a bare line, a SyntaxError)
print(df.groupby('continent')['country'].nunique())
# A Series can hold mixed types; its dtype then becomes object.
s = pd.Series(['banana', 42])
# Supply custom row labels with the index argument.
s = pd.Series(['Wes McKinney', 'Creator of Pandas'], index=['Person',
'Who'])
Person Wes McKinney
Who Creator of Pandas
dtype: object
# General DataFrame constructor shape (data / index / columns):
#a=pd.DataFrame(data=dict1,index=list1,columns=list2)
# select by row index label
# NOTE(review): scientists is the scientists.csv DataFrame loaded elsewhere
# in the notes, indexed by name here.
first_row = scientists.loc['William Gosset']
print(type(first_row))
<class 'pandas.core.series.Series'>
# A Series exposes its labels and data directly.
print(first_row.index)    # the row's labels (the DataFrame's column names)
print(first_row.values)   # the underlying data as an array
print(first_row.keys())   # keys() is an alias for .index
# FIX: the notes used `ages` before defining it; load the data first.
scientists = pd.read_csv('../data/scientists.csv')
ages = scientists['Age']
# Basic summary statistics on a numeric Series.
print(ages.describe())  # get basic stats
print(ages.mean())
print(ages.min())
print(ages.std())
print(ages.max())
# Boolean subsetting: keep only the ages above the mean.
print(ages[ages > ages.mean()])
print(ages > ages.mean())   # the boolean mask itself
# 'Born' is stored as strings (dtype object), not as datetimes.
print(scientists['Born'].dtype)
object
import random
# set a seed so the randomness is always the same
random.seed(42)
# NOTE(review): shuffling a pandas Series in place with random.shuffle
# triggers chained-assignment warnings in modern pandas — kept as written.
random.shuffle(scientists['Age'])
# we can convert the value to just the year
scientists['age_years_dt'] = scientists['age_days_dt'].\ astype('timedelta64[Y]')
print(scientists.columns)
# drop the shuffled age column
# you provide the axis=1 argument to drop column-wise
scientists_dropped = scientists.drop(['Age'], axis=1)
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack DataFrames row-wise.
print(pd.concat([df1, df2]))
col_concat = pd.concat([df1, df2, df3], axis=1)  # column-wise
pd.concat takes an iterable (usually a list) of DataFrames as its argument,
so DataFrames cannot be passed to it directly — wrap them in a list. The
DataFrames' dimensions should match along the axis being concatenated.
pd.merge takes DataFrames directly and combines two DataFrames on shared
columns or on the index without duplicating the key column — something
pd.concat cannot do, since concatenating simply repeats the shared column.
join, by contrast, aligns two DataFrames on their (possibly different) indices.
# dropna() removes every row that contains at least one missing value.
# NOTE(review): ebola is the Ebola case-count DataFrame loaded earlier.
ebola_dropna = ebola.dropna()
# skipping missing values is True by default
print(ebola.Cases_Guinea.sum(skipna = True))
# NOTE(review): billboard_long / billboard_songs come from earlier in the
# notes (long-format Billboard chart data) — not defined in this excerpt.
print(billboard_long[billboard_long.track == 'Loser'].head())
# De-duplicate the songs table before using it as a merge key table.
billboard_songs = billboard_songs.drop_duplicates()
# Merge the song dataframe to the original data set
billboard_ratings = billboard_long.merge( billboard_songs, on=['year', 'artist',
'track', 'time'])
# concatenate the dataframes together
# NOTE(review): taxi1..taxi5 and list_taxi_df are loaded earlier in the notes.
taxi = pd.concat([taxi1, taxi2, taxi3, taxi4, taxi5])
#Now that we have a list of dataframes, we can concatenate them.
taxi_loop_concat = pd.concat(list_taxi_df)
#Converting to String Objects
tips['sex_str'] = tips['sex'].astype(str)
#to_numbers
# NOTE(review): errors='ignore' (return the input unchanged when parsing
# fails) is deprecated in pandas >= 2.1; the remaining options are 'raise'
# and 'coerce' (failures become NaN).
tips_sub_miss['total_bill'] = pd.to_numeric( tips_sub_miss['total_bill'], errors='ignore')
"black Knight".capitalize()
'Black knight'
"It's just a flesh wound!".count('u')
2
"Halt! Who goes there?".startswith('Halt')
True
"coconut".endswith('nut')
True
"It's just a flesh wound!".find('u')
6
"It's just a flesh wound!".index('scratch')
ValueError
"old woman".isalpha()
False (there is a space in the string)
"37".isdecimal()
True
"I'm 37".isalnum()
False (apostrophe and space)
"Black Knight".lower()
'black knight'
"Black Knight".upper()
'BLACK KNIGHT'
"flesh wound!".replace('flesh wound', 'scratch')
'scratch!'
" I'm not dead. ".strip()
"I'm not dead."
"NI! NI! NI! NI!".split(sep=' ')
['NI!', 'NI!', 'NI!', 'NI!']
"3,4".partition(',')
('3', ',', '4')
"nine".center(10)
'   nine   '
"9".zfill(5)
'00009'
# Join degree/minute/second/hemisphere fragments into one coordinate string.
# NOTE(review): d1..u2 are string fragments defined earlier in the notes.
coords = ' '.join([d1, m1, s1, u1, d2, m2, s2, u2])
# splitlines() splits a multi-line string into a list of its lines.
multi_str_split = multi_str.splitlines()
#Find a Pattern
# NOTE(review): assumes `import re`, a pattern p, and a string s from earlier.
m = re.findall(pattern=p, string=s)
def print_me(x):
    """Print *x* and return None; used as a DataFrame.apply callback.

    FIX: the body was not indented in the original, which is a SyntaxError.
    """
    print(x)
# Column-wise apply: the callback receives each column as a Series.
df.apply(print_me, axis=0)
# Row-wise apply (axis=1): per-row missing-value statistics.
# NOTE(review): count_missing / prop_missing / prop_complete and the titanic
# DataFrame are defined earlier in the notes, not in this excerpt.
cmis_row = titanic.apply(count_missing, axis=1)
pmis_row = titanic.apply(prop_missing, axis=1)
pcom_row = titanic.apply(prop_complete, axis=1)
print(cmis_row.value_counts())
titanic['num_missing'] = titanic.apply(count_missing, axis=1)
# Extract the regex match from each value in column 0.
# NOTE(review): p is presumably a compiled regex; if .match(x) returns None
# this lambda raises AttributeError.
docs['name_lamb'] = docs[0].apply(lambda x: p.match(x).group())
# Two equivalent ways to compute mean life expectancy per year.
avg_life_exp_by_year = df.groupby('year').lifeExp.mean()
avg_life_exp_by_year = df.groupby('year')['lifeExp'].mean()
# get a list of unique years in the data
years = df.year.unique()
# subset the data for the year 1952
y1952 = df.loc[df.year == 1952, :]
y1952_mean = y1952.lifeExp.mean()
# Aggregate with an arbitrary function.
# NOTE(review): assumes `import numpy as np`; passing np.mean here is
# deprecated in pandas >= 2.0 — prefer the string 'mean'.
cont_le_agg2 = df.groupby('continent').lifeExp.aggregate(np.mean)
# NOTE(review): tips_filtered / tips_10 come from earlier in the notes.
print(tips_filtered['size'].value_counts())
# list all the columns
print(tips_10.columns)