Only Pandas


import pandas as pd

df = pd.read_csv('../data/gapminder.tsv', sep='\t')
print(type(df))
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.info())
Pandas type    Python type    Description
object         string         Most common data type
int64          int            Whole numbers
float64        float          Numbers with decimals
datetime64     datetime       datetime is found in the Python standard library

country_df = df['country']
print(country_df.head())
print(country_df.tail())
subset = df[['country', 'continent', 'year']]
print(subset.head())
Subset method    Description
loc              Subset based on index label (row name)
iloc             Subset based on row index (row number)
ix               Subset based on index label or row index (deprecated in Pandas v0.20, removed in later versions)
print(df.loc[0])   # get the first row
print(df.loc[99])  # get the 100th row
# select the first, 100th, and 1000th rows
print(df.loc[[0, 99, 999]])
print(df.head(n=10))
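A quick sketch of the difference between the two subset methods on this data's default integer index:

# loc matches index *labels*; iloc matches integer *positions*
print(df.iloc[-1])            # last row by position; df.loc[-1] would raise a KeyError here
print(df.iloc[[0, 99, 999]])  # same rows as the loc example above, since the default index runs 0..n-1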
print(df.groupby('year')['lifeExp'].mean())
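The flattened result on the next line assumes a grouped, multi-column aggregate; a minimal sketch of one such setup (the exact grouping keys are an assumption):

# group by two keys and average two measures
multi_group_var = df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']].mean()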
flat = multi_group_var.reset_index()  # flatten the hierarchical row index back into columns
# nunique() calculates the number of unique values in a Series
print(df.groupby('continent')['country'].nunique())
s = pd.Series(['banana', 42])
s = pd.Series(['Wes McKinney', 'Creator of Pandas'],
              index=['Person', 'Who'])
print(s)
Person         Wes McKinney
Who        Creator of Pandas
dtype: object
# a DataFrame can be assembled from a dict of columns, with an explicit row index and column order:
# a = pd.DataFrame(data=dict1, index=list1, columns=list2)
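The label-based selection below assumes a scientists DataFrame indexed by name; a minimal sketch following the book's scientists example:

scientists = pd.DataFrame(
    data={'Occupation': ['Chemist', 'Statistician'],
          'Born': ['1920-07-25', '1876-06-13'],
          'Died': ['1958-04-16', '1937-10-16'],
          'Age': [37, 61]},
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'])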
# select by row index label
first_row = scientists.loc['William Gosset']
print(type(first_row))
<class 'pandas.core.series.Series'>

print(first_row.index)
print(first_row.values)

print(first_row.keys())
scientists = pd.read_csv('../data/scientists.csv')
ages = scientists['Age']
print(ages.describe())  # get basic stats
print(ages.mean())
print(ages.min())
print(ages.std())
print(ages.max())
# boolean subsetting: keep only the ages above the mean
print(ages[ages > ages.mean()])
# the boolean mask itself
print(ages > ages.mean())

print(scientists['Born'].dtype)
object
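The Born and Died columns are read in as strings; the age_days_dt column used below assumes they were converted to datetimes first, as in the book's flow. A sketch:

scientists['born_dt'] = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
scientists['died_dt'] = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')
scientists['age_days_dt'] = scientists['died_dt'] - scientists['born_dt']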

import random
# set a seed so the randomness is always the same
random.seed(42)
# shuffle the Age column in place; in newer pandas this chained assignment may
# not propagate (copy-on-write), and scientists['Age'].sample(frac=1) is the
# idiomatic alternative
random.shuffle(scientists['Age'])
# we can convert the timedelta value to just the year
scientists['age_years_dt'] = scientists['age_days_dt'].astype('timedelta64[Y]')
# (pandas 2.x removed this cast; scientists['age_days_dt'].dt.days / 365.25 is one alternative)
print(scientists.columns)
# drop the shuffled age column
# you provide the axis=1 argument to drop column-wise
scientists_dropped = scientists.drop(['Age'], axis=1)

#One of the (conceptually) easier ways to combine data is with concatenation
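To make the next snippets concrete, assume three small frames with matching columns (df1, df2, df3 here are toy assumptions, not from the datasets):

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
df3 = pd.DataFrame({'A': [9, 10], 'B': [11, 12]})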


row_concat = pd.concat([df1, df2, df3])

print(df1.append(df2))  # note: DataFrame.append was removed in pandas 2.0; use pd.concat([df1, df2]) instead
col_concat = pd.concat([df1, df2, df3], axis=1)  # column-wise
pd.concat takes an iterable (such as a list) of DataFrames, so the DataFrames
cannot be passed directly as separate arguments. The dimensions of the
DataFrames should also match along the axis being concatenated.

pd.merge takes DataFrames as its arguments and combines two DataFrames on
shared columns or indices. pd.concat cannot do this: concatenating
column-wise simply repeats the shared column in the result.

DataFrame.join, by contrast, joins two DataFrames on their (possibly
different) indices.
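A toy illustration of the difference (left and right here are assumptions):

left = pd.DataFrame({'key': ['a', 'b'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['a', 'c'], 'rval': [3, 4]})
print(left.merge(right, on='key'))       # one 'key' column; inner join keeps only 'a'
print(pd.concat([left, right], axis=1))  # 'key' appears twice, rows paired by position
print(left.set_index('key').join(right.set_index('key')))  # join matches on the indices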

# the default value for 'how' is 'inner', so it doesn't need to be specified
o2o_merge = site.merge(visited_subset,
                       left_on='name', right_on='site')
from numpy import NaN, NAN, nan
print(NaN == True)
False
print(NaN == False)
False
print(NaN == 0)
False
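NaN is not even equal to itself, which is why pd.isnull (or pd.notnull) is the right test for missing values:

print(NaN == NaN)       # False
print(pd.isnull(NaN))   # True
print(pd.notnull(NaN))  # False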
# keep_default_na=False reads empty fields as empty strings instead of NaN
print(pd.read_csv(visited_file, keep_default_na=False))
scientists['missing'] = nan

# count the number of non-missing values in each column
print(ebola.count())
num_rows = ebola.shape[0]
num_missing = num_rows - ebola.count()
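An equivalent count with numpy, applying np.count_nonzero to the boolean missing-value mask:

import numpy as np
print(np.count_nonzero(ebola.isnull()))                  # total missing values in the whole frame
print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))  # missing values in a single column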
# get the first 5 value counts from the Cases_Guinea column
print(ebola.Cases_Guinea.value_counts(dropna=False).head())
print(ebola.fillna(0).iloc[0:10, 0:5])
# forward fill: missing values are replaced with the last known/recorded value
print(ebola.fillna(method='ffill').iloc[0:10, 0:5])  # newer pandas prefers ebola.ffill()
# interpolation in Pandas fills in missing values linearly: it treats the
# missing values as if they should be equally spaced apart
print(ebola.interpolate().iloc[0:10, 0:5])

ebola_dropna = ebola.dropna()
# skipping missing values is True by default
print(ebola.Cases_Guinea.sum(skipna=True))

print(billboard_long[billboard_long.track == 'Loser'].head())
billboard_songs = billboard_songs.drop_duplicates()
# Merge the song dataframe to the original data set
billboard_ratings = billboard_long.merge(
    billboard_songs, on=['year', 'artist', 'track', 'time'])
# concatenate the dataframes together
taxi = pd.concat([taxi1, taxi2, taxi3, taxi4, taxi5])
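The loop-based alternative below first collects the frames into a list; a sketch (the file-name pattern here is an assumption, substitute the real taxi files):

list_taxi_df = []
for n in range(1, 6):
    # hypothetical file names for the five monthly taxi files
    df = pd.read_csv('../data/taxi{}.csv'.format(n))
    list_taxi_df.append(df)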
#Now that we have a list of dataframes, we can concatenate them.
taxi_loop_concat = pd.concat(list_taxi_df)
#Converting to String Objects
tips['sex_str'] = tips['sex'].astype(str)
# to_numeric converts a column to a numeric dtype
tips_sub_miss['total_bill'] = pd.to_numeric(
    tips_sub_miss['total_bill'], errors='ignore')  # 'ignore' returns the input unchanged on failure
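With errors='coerce', unparseable values become NaN instead of the column silently staying as strings:

tips_sub_miss['total_bill'] = pd.to_numeric(
    tips_sub_miss['total_bill'], errors='coerce')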
"black Knight".capitalize()
'Black knight'
"It's just a flesh wound!".count('u')
2
"Halt! Who goes there?".startswith('Halt')
True
"coconut".endswith('nut')
True
"It's just a flesh wound!".find('u')
7
"It's just a flesh wound!".index('scratch')
ValueError
"old woman".isalpha()
False (there is a whitespace)
"37".isdecimal()
True
"I'm 37".isalnum()
False (apostrophe and space)
"Black Knight".lower()
'black knight'
"Black Knight".upper()
'BLACK KNIGHT'
"flesh wound!".replace('flesh wound', 'scratch')
'scratch!'
" I'm not dead. ".strip()
"I'm not dead."
"NI! NI! NI! NI!".split(sep=' ')
['NI!', 'NI!', 'NI!', 'NI!']
"3,4.partition(',')
('3', ',', '4')
"nine".center(width=10)
' nine '
"9".zfill(with=5)
'00009'
coords = ' '.join([d1, m1, s1, u1, d2, m2, s2, u2])
multi_str_split = multi_str.splitlines()

var = 'flesh wound'


s = "It's just a {}!"
print(s.format(var))
It's just a flesh wound!

# find all matches of a pattern
m = re.findall(pattern=p, string=s)
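A self-contained sketch (the pattern p and string s here are assumptions, in the spirit of the book's phone-number examples):

import re

p = re.compile(r'\d{10}')                  # a 10-digit "phone number" pattern
s = 'call 1234567890 or 9876543210'
m = re.findall(pattern=p, string=s)
print(m)  # ['1234567890', '9876543210']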

def print_me(x):
    print(x)

df.apply(print_me, axis=0)  # apply the function to each column
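The row-wise apply calls below assume the book's small helper functions; a minimal sketch:

import numpy as np

def count_missing(vec):
    """Count the number of missing values in a row or column."""
    return np.sum(vec.isnull())

def prop_missing(vec):
    """Proportion of values in a row or column that are missing."""
    return count_missing(vec) / vec.size

def prop_complete(vec):
    """Proportion of values in a row or column that are not missing."""
    return 1 - prop_missing(vec)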
cmis_row = titanic.apply(count_missing, axis=1)
pmis_row = titanic.apply(prop_missing, axis=1)
pcom_row = titanic.apply(prop_complete, axis=1)
print(cmis_row.value_counts())
titanic['num_missing'] = titanic.apply(count_missing, axis=1)
docs['name_lamb'] = docs[0].apply(lambda x: p.match(x).group())  # p is a compiled regex; extract the leading match from each string

# two equivalent spellings of the same grouped mean
avg_life_exp_by_year = df.groupby('year').lifeExp.mean()
avg_life_exp_by_year = df.groupby('year')['lifeExp'].mean()
# get a list of unique years in the data
years = df.year.unique()
# subset the data for the year 1952
y1952 = df.loc[df.year == 1952, :]
y1952_mean = y1952.lifeExp.mean()
cont_le_agg2 = df.groupby('continent').lifeExp.aggregate(np.mean)
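agg/aggregate also accepts user-defined functions; a sketch following the book's flow (my_mean is the book's example name):

import numpy as np

def my_mean(values):
    """A hand-rolled mean, to show that agg accepts custom functions."""
    n = len(values)
    return values.sum() / n

cont_le_agg = df.groupby('continent').lifeExp.agg(my_mean)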

import seaborn as sns


import numpy as np
np.random.seed(42)
# sample 10 rows from tips
tips_10 = sns.load_dataset('tips').sample(10)
tips = sns.load_dataset('tips')
print(tips['size'].value_counts())
# filter the data so only groups with at least 30 observations are kept
tips_filtered = tips.groupby('size').filter(lambda x: x['size'].count() >= 30)

print(tips_filtered['size'].value_counts())
# list all the columns
print(tips_10.columns)

# get the 'Female' group (this assumes the data was grouped by sex first)
grouped = tips_10.groupby('sex')
female = grouped.get_group('Female')

from datetime import datetime


now = datetime.now()
t2 = datetime(1970, 1, 1)
diff = now - t2
import pandas as pd
ebola = pd.read_csv('../data/country_timeseries.csv')
print(ebola.info())
ebola['date_dt'] = pd.to_datetime(ebola['Date'])
# the same conversion with an explicit format, which parses faster
ebola['date_dt'] = pd.to_datetime(ebola['Date'], format='%m/%d/%Y')
print(ebola.info())
d = pd.to_datetime('2016-02-29')
print(d.year)
print(d.month)
print(d.day)
ebola['date_dt'] = pd.to_datetime(ebola['Date'])
ebola['year'] = ebola['date_dt'].dt.year
print(ebola['date_dt'].min())
banks['closing_quarter'], banks['closing_year'] = \
    (banks['Closing Date'].dt.quarter,
     banks['Closing Date'].dt.year)
closing_year = banks.groupby(['closing_year']).size()
print(tesla.loc[(tesla.Date.dt.year == 2010) &
                (tesla.Date.dt.month == 6)])
tesla.index = tesla['Date']
head_range = pd.date_range(start='2014-12-31', end='2015-01-05')
