Pandas Syntax Revision For ML
Reading Data
import pandas as pd

# Basic data loading
df = pd.read_csv('file.csv')
df = pd.read_csv('file.csv', low_memory=False) # Read in one pass to avoid mixed-dtype inference (DtypeWarning)
df = pd.read_csv('file.csv', index_col=0) # Set first column as index
df = pd.read_csv('file.csv', parse_dates=['date_col']) # Parse dates automatically
# Column information
df.columns # Column names
df.dtypes # Data types of each column
df.index # Index information
df.head(10) # First 10 rows
df.tail(5) # Last 5 rows
df.sample(3) # Random 3 rows
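A minimal runnable sketch of the loading and inspection calls above, using an in-memory CSV in place of a real file (the column names are made up for illustration):
import io
import pandas as pd

csv_data = io.StringIO('date_col,price\n2024-01-01,10\n2024-01-02,12\n')
df = pd.read_csv(csv_data, parse_dates=['date_col'])
print(df.dtypes)   # date_col -> datetime64[ns], price -> int64
print(df.head())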
Column Selection
# Single column
df['column_name'] # Returns Series
df[['column_name']] # Returns DataFrame
# Multiple columns
df[['col1', 'col2', 'col3']]
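The single vs double bracket distinction matters when an API expects 2-D input (e.g. scikit-learn's fit). A quick check on a toy frame:
import pandas as pd

df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
print(type(df['col1']).__name__)    # Series
print(type(df[['col1']]).__name__)  # DataFrame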
Row Selection
# By index position
df.iloc[0] # First row
df.iloc[0:5] # First 5 rows
df.iloc[:, 0:3] # All rows, first 3 columns
df.iloc[0:5, 0:3] # First 5 rows, first 3 columns
# By label
df.loc[0] # Row with index label 0
df.loc[0:4] # Rows with index 0 to 4 (inclusive)
df.loc[:, 'col1':'col3'] # All rows, columns from col1 to col3
df.loc[df['column'] > 5] # Conditional selection
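One subtlety worth remembering: iloc slices exclude the end position, while loc slices include the end label. A toy frame makes the difference visible:
import pandas as pd

df = pd.DataFrame({'x': range(10, 16)})  # default integer labels 0..5
print(len(df.iloc[0:3]))  # 3 rows: positions 0, 1, 2 (end excluded)
print(len(df.loc[0:3]))   # 4 rows: labels 0, 1, 2, 3 (end included)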
Boolean Indexing
# Single condition
df[df['age'] > 25]
df[df['category'] == 'A']
df[df['name'].str.contains('John', na=False)] # na=False so NaN entries don't break the boolean mask
# Multiple conditions
df[(df['age'] > 25) & (df['salary'] < 50000)]
df[(df['category'] == 'A') | (df['category'] == 'B')]
df[df['column'].isin(['value1', 'value2', 'value3'])]
# Negation
df[~df['column'].isin(['unwanted_value'])]
df[df['column'] != 'unwanted_value']
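A minimal sketch combining the patterns above; note the parentheses around each condition, which are required because & and | bind tighter than the comparison operators:
import pandas as pd

df = pd.DataFrame({'age': [22, 30, 45], 'category': ['A', 'B', 'C']})
mask = (df['age'] > 25) & (df['category'].isin(['B', 'C']))
print(df[mask])   # rows with age 30 and 45
print(df[~mask])  # the negation: only the age-22 row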
Duplicate Handling
# Finding duplicates
df.duplicated().sum() # Count of duplicate rows
df.duplicated(subset=['col1']).sum() # Duplicates based on specific column
# Removing duplicates
df.drop_duplicates() # Remove duplicate rows
df.drop_duplicates(subset=['col1']) # Remove duplicates based on column
df.drop_duplicates(keep='last') # Keep last occurrence
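A toy example showing how keep decides which duplicate survives:
import pandas as pd

df = pd.DataFrame({'col1': ['a', 'a', 'b'], 'val': [1, 2, 3]})
print(df.duplicated(subset=['col1']).sum())              # 1: the second 'a' is a duplicate
print(df.drop_duplicates(subset=['col1']))               # keeps the first 'a' (val=1)
print(df.drop_duplicates(subset=['col1'], keep='last'))  # keeps the last 'a' (val=2)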
Data Transformation
String Operations
# Basic string operations
df['column'].str.lower() # Convert to lowercase
df['column'].str.upper() # Convert to uppercase
df['column'].str.strip() # Remove leading/trailing whitespace
df['column'].str.replace('old', 'new') # Replace text
# String conditions
df['column'].str.contains('pattern')
df['column'].str.startswith('prefix')
df['column'].str.endswith('suffix')
df['column'].str.len() # Length of strings
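These methods chain naturally when cleaning a messy text column; a small sketch on made-up names:
import pandas as pd

df = pd.DataFrame({'name': ['  John Smith ', 'JANE DOE', 'john brown']})
df['clean'] = df['name'].str.strip().str.lower()
print(df[df['clean'].str.startswith('john')])  # rows 0 and 2 match after cleaning
print(df['clean'].str.len())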
Date and Time Operations
# Converting to datetime
df['date'] = pd.to_datetime(df['date'])
# Date arithmetic
df['days_ago'] = (pd.Timestamp.now() - df['date']).dt.days
df['date_plus_30'] = df['date'] + pd.Timedelta(days=30)
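A runnable sketch of the date arithmetic above; the .dt accessor also exposes components like year, month, and dayofweek, which are common ML features:
import pandas as pd

df = pd.DataFrame({'date': ['2024-01-01', '2024-03-15']})
df['date'] = pd.to_datetime(df['date'])
df['days_ago'] = (pd.Timestamp.now() - df['date']).dt.days
df['month'] = df['date'].dt.month                      # component extraction
df['date_plus_30'] = df['date'] + pd.Timedelta(days=30)
print(df)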
Advanced Aggregation
# Multiple aggregations
df.groupby('category').agg({
    'price': ['mean', 'std'],
    'quantity': 'sum',
    'date': 'count'
})
# Custom aggregation functions
df.groupby('category')['price'].agg(lambda x: x.max() - x.min())
df.groupby('category').apply(custom_function)
# Filter groups
df.groupby('category').filter(lambda x: len(x) > 10) # Groups with >10 records
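A self-contained example of the dict-style agg, a custom lambda, and group filtering (custom_function above is a placeholder; here the custom aggregation is a simple range):
import pandas as pd

df = pd.DataFrame({'category': ['A', 'A', 'B', 'B', 'B'],
                   'price': [10.0, 14.0, 7.0, 9.0, 20.0]})
print(df.groupby('category').agg({'price': ['mean', 'std']}))
print(df.groupby('category')['price'].agg(lambda x: x.max() - x.min()))  # A: 4.0, B: 13.0
print(df.groupby('category').filter(lambda g: len(g) > 2))               # only B rows survive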
Concatenation
# Vertical concatenation (stacking rows)
combined = pd.concat([df1, df2], axis=0, ignore_index=True)
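With axis=0 columns are aligned by name and any missing columns become NaN; ignore_index=True discards the old row labels. A minimal sketch:
import pandas as pd

df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df2 = pd.DataFrame({'a': [5], 'b': [6]})
combined = pd.concat([df1, df2], axis=0, ignore_index=True)
print(combined)  # 3 rows with a fresh 0..2 index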
Merging DataFrames
# Inner join (only matching records)
merged = pd.merge(df1, df2, on='key_column', how='inner')
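The how parameter controls which keys survive: 'inner' keeps only matches, while 'left' keeps every row of the left frame and fills unmatched right-hand columns with NaN. A toy sketch:
import pandas as pd

df1 = pd.DataFrame({'key_column': [1, 2, 3], 'x': ['a', 'b', 'c']})
df2 = pd.DataFrame({'key_column': [2, 3, 4], 'y': ['B', 'C', 'D']})
print(pd.merge(df1, df2, on='key_column', how='inner'))  # keys 2 and 3 only
print(pd.merge(df1, df2, on='key_column', how='left'))   # keys 1-3; y is NaN for key 1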
Reshaping Data
Pivot Operations
# Pivot table
pivot = df.pivot_table(values='sales',
                       index='date',
                       columns='product',
                       aggfunc='sum')
# Simple pivot
pivot = df.pivot(index='date', columns='product', values='sales')
# Multi-level operations
df.unstack(level=0)            # Move index level 0 into columns (requires a MultiIndex)
df.unstack(level='level_name') # Same, selecting the index level by name
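Note that pivot requires unique index/column pairs and raises on duplicates, whereas pivot_table aggregates them; a minimal example of the latter:
import pandas as pd

df = pd.DataFrame({'date': ['d1', 'd1', 'd1', 'd2'],
                   'product': ['p1', 'p1', 'p2', 'p1'],
                   'sales': [5, 3, 2, 7]})
pivot = df.pivot_table(values='sales', index='date', columns='product', aggfunc='sum')
print(pivot)  # the duplicate (d1, p1) rows are summed to 8; missing (d2, p2) is NaN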
Statistical Operations
Descriptive Statistics
# Basic statistics
df['column'].mean()
df['column'].median()
df['column'].std()
df['column'].var()
df['column'].min()
df['column'].max()
df['column'].quantile(0.25) # 25th percentile
# Correlation
df.corr(numeric_only=True) # Correlation matrix (numeric columns only; required if the frame has non-numeric columns)
df['col1'].corr(df['col2']) # Correlation between two columns
# Unique values
df['column'].unique()
df['column'].nunique() # Number of unique values
# Quantile-based binning
df['price_quartile'] = pd.qcut(df['price'], q=4, labels=['Low', 'Med-Low', 'Med-High', 'High'])
# Lag features
df['value_lag1'] = df['value'].shift(1)
df['value_lag7'] = df['value'].shift(7)
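A quick sketch of qcut and shift together; shift introduces NaN in the first rows, which downstream models usually need handled (dropped or filled):
import pandas as pd

df = pd.DataFrame({'price': [10, 20, 30, 40], 'value': [1.0, 2.0, 3.0, 4.0]})
df['price_half'] = pd.qcut(df['price'], q=2, labels=['Low', 'High'])
df['value_lag1'] = df['value'].shift(1)  # first row becomes NaN
print(df)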
Memory Optimization
# Reduce memory usage by downcasting integer columns to the smallest safe dtype
import numpy as np

def reduce_mem_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
    return df
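Usage sketch, comparing memory before and after downcasting with the function defined above; the toy column fits in int8, so its footprint drops roughly 8x:
import numpy as np
import pandas as pd

df = pd.DataFrame({'small_ints': np.arange(100, dtype=np.int64)})
before = df.memory_usage(deep=True).sum()
df = reduce_mem_usage(df)
after = df.memory_usage(deep=True).sum()
print(before, '->', after, 'bytes;', df['small_ints'].dtype)  # dtype becomes int8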
# Frequency encoding
freq_encoding = train_df['category'].value_counts()
train_df['category_freq'] = train_df['category'].map(freq_encoding)
# Interaction features
train_df['feature_interaction'] = train_df['feature1'] * train_df['feature2']
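A self-contained sketch of both feature-engineering steps (train_df here is a made-up toy frame):
import pandas as pd

train_df = pd.DataFrame({'category': ['a', 'a', 'b'],
                         'feature1': [1.0, 2.0, 3.0],
                         'feature2': [10.0, 20.0, 30.0]})
freq_encoding = train_df['category'].value_counts()
train_df['category_freq'] = train_df['category'].map(freq_encoding)  # a -> 2, b -> 1
train_df['feature_interaction'] = train_df['feature1'] * train_df['feature2']
print(train_df)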
These pandas operations form the backbone of most data science workflows and Kaggle competitions.