
My notes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             accuracy_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler

# Remove duplicate rows and parse Year as a datetime
df = df.drop_duplicates()
df['Year'] = pd.to_datetime(df['Year'], format='%Y', errors='coerce')

def convert_new_price(price):
    price_str = price.strip().replace(',', '')
    if ' Lakh' in price_str:
        return float(price_str.replace(' Lakh', '')) * 100000      # 1 Lakh = 100,000
    elif ' Cr' in price_str:
        return float(price_str.replace(' Cr', '')) * 10000000      # 1 Crore = 10,000,000
    else:
        return float(price_str)
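
A minimal usage sketch for the helper above, assuming the raw prices live in a column named New_Price (the column name is an assumption; adjust it to the actual dataset):

# Hypothetical usage: convert a 'New_Price' column (name assumed) to numeric values
df['New_Price'] = df['New_Price'].apply(lambda p: convert_new_price(p) if pd.notna(p) else np.nan)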

# Current year and car age

current_year = pd.to_datetime("now").year

df['Car_Age'] = current_year - df['Year'].dt.year

# Example: histogram of car age (column, bin count, and colour are placeholder choices)
sns.histplot(df['Car_Age'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Car Age')
plt.xlabel('Car Age (years)')
plt.ylabel('Count')
plt.show()

fuel_counts = df['Fuel_Type'].value_counts()

plt.bar(fuel_counts.index, fuel_counts.values, color='skyblue')

plt.title('Distribution of Fuel Type')

plt.xlabel('Fuel Type')

plt.ylabel('Count')

plt.xticks(rotation=45)

plt.show()

# Price distribution by fuel type
sns.boxplot(data=df, x='Fuel_Type', y='Price', palette='pastel')

# Average price per year (this plot needs a Year-level grouping)
avg_price_by_year = df.groupby(df['Year'].dt.year)['Price'].mean().reset_index()
sns.lineplot(data=avg_price_by_year, x='Year', y='Price', marker='o', color='teal')

# Price vs. kilometres driven
sns.scatterplot(data=df, x='Kilometers_Driven', y='Price', color='purple', alpha=0.6)

summary_table = df.groupby('Fuel_Type').agg({

'Price': ['mean', 'median'],

'Mileage': ['mean', 'median'],

'Engine': ['mean', 'median']

}).reset_index()

summary_table.columns = ['Fuel_Type', 'Mean_Price', 'Median_Price', 'Mean_Mileage',
                         'Median_Mileage', 'Mean_Engine', 'Median_Engine']

print("\nSummary Table:\n", summary_table)

avg_price = df.groupby(['Fuel_Type', 'Transmission', 'Owner_Type'])['Price'].mean().reset_index()

km_driven_by_location = df.groupby('Location')['Kilometers_Driven'].sum().reset_index()
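
km_driven_by_location is computed above but never plotted; a small optional sketch to chart it (figure size, colour, and label rotation are arbitrary choices):

# Optional: total kilometres driven per location
plt.figure(figsize=(10, 5))
sns.barplot(data=km_driven_by_location, x='Location', y='Kilometers_Driven', color='skyblue')
plt.title('Total Kilometers Driven by Location')
plt.xticks(rotation=45)
plt.show()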

# Convert Engine to numeric (strip the ' CC' unit)
df['Engine'] = df['Engine'].astype(str).str.replace(' CC', '', regex=False)
df['Engine'] = pd.to_numeric(df['Engine'], errors='coerce')

# Convert Power the same way (the ' bhp' unit is an assumption about this dataset)
df['Power'] = pd.to_numeric(df['Power'].astype(str).str.replace(' bhp', '', regex=False), errors='coerce')

# Convert Mileage to numeric (strip the ' km/kg' and ' kmpl' units)
df['Mileage'] = df['Mileage'].astype(str).str.replace(' km/kg', '', regex=False).str.replace(' kmpl', '', regex=False)
df['Mileage'] = pd.to_numeric(df['Mileage'], errors='coerce')
# Heatmap of numeric correlations
correlation = df[['Engine', 'Power', 'Mileage', 'Price']].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap (Numeric Variables)')
plt.show()

features = ['Car_Age', 'Kilometers_Driven', 'Engine', 'Power']

# Drop rows with missing feature or target values so the models below can fit
df = df.dropna(subset=features + ['Price'])

X = df[features]

y = df['Price']

# Split the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model

model = LinearRegression()

model.fit(X_train, y_train)

# Make predictions

y_pred = model.predict(X_test)

# Evaluate the model

mae = mean_absolute_error(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}, MSE: {mse}, R²: {r2}')
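
An optional follow-up to see how each feature contributes to the fitted line (uses the model and features variables defined above):

# Optional: pair the fitted coefficients with their feature names
print('Intercept:', model.intercept_)
print(pd.Series(model.coef_, index=features).sort_values())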



# Select numerical columns

features_kmeans = ['Kilometers_Driven', 'Car_Age', 'Engine', 'Power', 'Mileage']

X_kmeans = df[features_kmeans].dropna() # Remove rows with NaN

# Normalize features

scaler = StandardScaler()

X_kmeans_scaled = scaler.fit_transform(X_kmeans)

# Elbow method to find optimal number of clusters

inertia = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_kmeans_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))

plt.plot(range(1, 11), inertia, marker='o')

plt.title('Elbow Method')

plt.xlabel('Number of Clusters')

plt.ylabel('Inertia')

plt.show()

# Applying K-means with optimal clusters (assume 3 clusters based on elbow)

optimal_clusters = 3

kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

cluster_labels = kmeans.fit_predict(X_kmeans_scaled)
df.loc[X_kmeans.index, 'Cluster'] = cluster_labels  # align labels with the rows kept after dropna()

# Visualizing clusters (using first two features for simplicity)

plt.figure(figsize=(8, 5))

plt.scatter(X_kmeans_scaled[:, 0], X_kmeans_scaled[:, 1], c=cluster_labels, cmap='viridis', alpha=0.5)

plt.title('K-Means Clustering of Cars')

plt.xlabel(features_kmeans[0]) # Kilometers_Driven

plt.ylabel(features_kmeans[1]) # Car_Age

plt.show()

# K-Means Clustering

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42)

df['Cluster'] = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10, 5))

sns.scatterplot(data=df, x='Kilometers_Driven', y='Price', hue='Cluster', palette='viridis')

plt.title('K-Means Clustering (3 Clusters)')

plt.show()
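
To interpret the three clusters in the plot above, one optional check is the mean feature value per cluster (a quick sketch reusing the same feature list as X):

# Optional: profile each cluster by its average feature values
print(df.groupby('Cluster')[features].mean().round(2))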

# Linear Regression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)

print("Linear Regression MAE:", mean_absolute_error(y_test, y_pred))

print("Linear Regression MSE:", mean_squared_error(y_test, y_pred))

print("Linear Regression R²:", r2_score(y_test, y_pred))

# Logistic Regression
df['Price_Category'] = (df['Price'] > df['Price'].median()).astype(int)

y = df['Price_Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

log_reg = LogisticRegression(max_iter=1000)

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

print("Classification Report:\n", classification_report(y_test, y_pred))

1. Pair Plot

Visualization: A pair plot allows you to visualize relationships between multiple numerical
features at once, showing scatter plots for each pair of variables.

sns.pairplot(df, vars=['Price', 'Kilometers_Driven', 'Car_Age', 'Engine', 'Power'], diag_kind='kde')
plt.suptitle('Pair Plot of Numerical Features', y=1.02)
plt.show()

Explanation: This plot helps you identify correlations and relationships between features,
such as whether higher engine power is associated with higher prices or if older cars tend to
have lower prices.

2. Violin Plot

Visualization: A violin plot can show the distribution of prices across different categories
(like Fuel_Type).

plt.figure(figsize=(10, 6))
sns.violinplot(x='Fuel_Type', y='Price', data=df, inner='quartile')
plt.title('Price Distribution by Fuel Type')
plt.show()

Explanation: This plot provides insight into how prices are distributed for different fuel
types, revealing the range, median, and potential outliers in the data.

3. Box Plot for Price by Owner Type


Visualization: A box plot can compare the price distributions across different owner types.

plt.figure(figsize=(10, 6))
sns.boxplot(x='Owner_Type', y='Price', data=df)
plt.title('Price Distribution by Owner Type')
plt.show()

Explanation: This visualization helps you understand how the price varies with the number of
previous owners. It can show which owner types tend to command higher or lower prices,
revealing patterns in the used car market.
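
To put numbers behind the box plot, an optional follow-up is a quick median-price table per owner type:

# Optional: median price per owner type, highest first
print(df.groupby('Owner_Type')['Price'].median().sort_values(ascending=False))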

4. Heatmap of Categorical Variables

Visualization: A heatmap showing the average price for combinations of Fuel_Type and
Transmission can illustrate how these features interact.

price_heatmap = df.groupby(['Fuel_Type', 'Transmission'])['Price'].mean().unstack()
plt.figure(figsize=(10, 6))
sns.heatmap(price_heatmap, annot=True, cmap='YlGnBu')
plt.title('Average Price by Fuel Type and Transmission')
plt.ylabel('Fuel Type')
plt.xlabel('Transmission')
plt.show()

Explanation: This heatmap highlights how average prices vary with different combinations
of fuel types and transmission types, allowing for quick comparisons between categories.

5. Scatter Plot with Trend Line

Visualization: A scatter plot of Price vs. Engine with a regression line can visualize
relationships.

plt.figure(figsize=(10, 6))
sns.regplot(x='Engine', y='Price', data=df)
plt.title('Price vs. Engine Size with Regression Line')
plt.show()

Explanation: This plot helps to visualize the relationship between engine size and price,
showing trends and potential outliers. The regression line can indicate whether larger engines
tend to lead to higher prices.

6. Bar Chart of Average Mileage by Fuel Type

Visualization: A bar chart showing average mileage by fuel type can highlight efficiency
differences.

average_mileage = df.groupby('Fuel_Type')['Mileage'].mean().reset_index()
plt.figure(figsize=(10, 6))
sns.barplot(x='Fuel_Type', y='Mileage', data=average_mileage)
plt.title('Average Mileage by Fuel Type')
plt.show()

Explanation: This visualization can provide insights into which fuel types tend to be more
fuel-efficient, potentially influencing purchasing decisions.

7. Cumulative Distribution Function (CDF)

Visualization: A CDF can show the probability that a car's price is less than or equal to a
certain value.

plt.figure(figsize=(10, 6))
sns.ecdfplot(df['Price'])
plt.title('Cumulative Distribution Function of Car Prices')
plt.xlabel('Price')
plt.ylabel('Cumulative Probability')
plt.show()

Explanation: This plot helps to understand the distribution of car prices across the dataset,
indicating what percentage of cars are priced below a certain threshold.
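
To read specific thresholds off the CDF, an optional complement is to print a few price quantiles:

# Optional: price values below which 25%, 50%, 75% and 90% of cars fall
print(df['Price'].quantile([0.25, 0.50, 0.75, 0.90]))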
