
My notes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             accuracy_score, confusion_matrix, classification_report)
from sklearn.preprocessing import StandardScaler

# Remove duplicate rows and parse Year as a datetime
df = df.drop_duplicates()
df['Year'] = pd.to_datetime(df['Year'], format='%Y', errors='coerce')

def convert_new_price(price):
    price_str = price.strip().replace(',', '')
    if ' Lakh' in price_str:
        return float(price_str.replace(' Lakh', '')) * 100000      # 1 Lakh = 100,000
    elif ' Cr' in price_str:
        return float(price_str.replace(' Cr', '')) * 10000000      # 1 Crore = 10,000,000
    else:
        return float(price_str)
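
A minimal usage sketch for the helper above, assuming the raw prices live in a column named New_Price (the column name is an assumption; adjust it to the actual dataset):

# Hypothetical usage: convert a 'New_Price' column (name assumed) to numeric values
df['New_Price'] = df['New_Price'].apply(lambda p: convert_new_price(p) if pd.notna(p) else np.nan)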

# Current year and car age

current_year = pd.to_datetime("now").year

df['Car_Age'] = current_year - df['Year'].dt.year

# Example: histogram of car age (column, bin count, and colour are placeholder choices)
sns.histplot(df['Car_Age'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Car Age')
plt.xlabel('Car Age (years)')
plt.ylabel('Count')
plt.show()

fuel_counts = df['Fuel_Type'].value_counts()

plt.bar(fuel_counts.index, fuel_counts.values, color='skyblue')

plt.title('Distribution of Fuel Type')

plt.xlabel('Fuel Type')

plt.ylabel('Count')

plt.xticks(rotation=45)

plt.show()

# Price distribution by fuel type
sns.boxplot(data=df, x='Fuel_Type', y='Price', palette='pastel')

# Average price per year (this plot needs a Year-level grouping)
avg_price_by_year = df.groupby(df['Year'].dt.year)['Price'].mean().reset_index()
sns.lineplot(data=avg_price_by_year, x='Year', y='Price', marker='o', color='teal')

# Price vs. kilometres driven
sns.scatterplot(data=df, x='Kilometers_Driven', y='Price', color='purple', alpha=0.6)

summary_table = df.groupby('Fuel_Type').agg({

'Price': ['mean', 'median'],

'Mileage': ['mean', 'median'],

'Engine': ['mean', 'median']

}).reset_index()

summary_table.columns = ['Fuel_Type', 'Mean_Price', 'Median_Price', 'Mean_Mileage',
                         'Median_Mileage', 'Mean_Engine', 'Median_Engine']

print("\nSummary Table:\n", summary_table)

avg_price = df.groupby(['Fuel_Type', 'Transmission', 'Owner_Type'])['Price'].mean().reset_index()

km_driven_by_location = df.groupby('Location')['Kilometers_Driven'].sum().reset_index()
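
km_driven_by_location is computed above but never plotted; a small optional sketch to chart it (figure size, colour, and label rotation are arbitrary choices):

# Optional: total kilometres driven per location
plt.figure(figsize=(10, 5))
sns.barplot(data=km_driven_by_location, x='Location', y='Kilometers_Driven', color='skyblue')
plt.title('Total Kilometers Driven by Location')
plt.xticks(rotation=45)
plt.show()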

# Convert Engine to numeric (strip the ' CC' unit)
df['Engine'] = df['Engine'].astype(str).str.replace(' CC', '', regex=False)
df['Engine'] = pd.to_numeric(df['Engine'], errors='coerce')

# Convert Power the same way (the ' bhp' unit is an assumption about this dataset)
df['Power'] = pd.to_numeric(df['Power'].astype(str).str.replace(' bhp', '', regex=False), errors='coerce')

# Convert Mileage to numeric (strip the ' km/kg' and ' kmpl' units)
df['Mileage'] = df['Mileage'].astype(str).str.replace(' km/kg', '', regex=False).str.replace(' kmpl', '', regex=False)
df['Mileage'] = pd.to_numeric(df['Mileage'], errors='coerce')
# Heatmap of numeric correlations
correlation = df[['Engine', 'Power', 'Mileage', 'Price']].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap (Numeric Variables)')
plt.show()

features = ['Car_Age', 'Kilometers_Driven', 'Engine', 'Power']

# Drop rows with missing feature or target values so the models below can fit
df = df.dropna(subset=features + ['Price'])

X = df[features]

y = df['Price']

# Split the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model

model = LinearRegression()

model.fit(X_train, y_train)

# Make predictions

y_pred = model.predict(X_test)

# Evaluate the model

mae = mean_absolute_error(y_test, y_pred)

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}, MSE: {mse}, R²: {r2}')
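
An optional follow-up to see how each feature contributes to the fitted line (uses the model and features variables defined above):

# Optional: pair the fitted coefficients with their feature names
print('Intercept:', model.intercept_)
print(pd.Series(model.coef_, index=features).sort_values())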



# Select numerical columns

features_kmeans = ['Kilometers_Driven', 'Car_Age', 'Engine', 'Power', 'Mileage']

X_kmeans = df[features_kmeans].dropna() # Remove rows with NaN

# Normalize features

scaler = StandardScaler()

X_kmeans_scaled = scaler.fit_transform(X_kmeans)

# Elbow method to find optimal number of clusters

inertia = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_kmeans_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))

plt.plot(range(1, 11), inertia, marker='o')

plt.title('Elbow Method')

plt.xlabel('Number of Clusters')

plt.ylabel('Inertia')

plt.show()

# Applying K-means with optimal clusters (assume 3 clusters based on elbow)

optimal_clusters = 3

kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

cluster_labels = kmeans.fit_predict(X_kmeans_scaled)
df.loc[X_kmeans.index, 'Cluster'] = cluster_labels  # align labels with the rows kept after dropna()

# Visualizing clusters (using first two features for simplicity)

plt.figure(figsize=(8, 5))

plt.scatter(X_kmeans_scaled[:, 0], X_kmeans_scaled[:, 1], c=cluster_labels, cmap='viridis', alpha=0.5)

plt.title('K-Means Clustering of Cars')

plt.xlabel(features_kmeans[0]) # Kilometers_Driven

plt.ylabel(features_kmeans[1]) # Car_Age

plt.show()

# K-Means Clustering

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42)

df['Cluster'] = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10, 5))

sns.scatterplot(data=df, x='Kilometers_Driven', y='Price', hue='Cluster', palette='viridis')

plt.title('K-Means Clustering (3 Clusters)')

plt.show()
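
To interpret the three clusters in the plot above, one optional check is the mean feature value per cluster (a quick sketch reusing the same feature list as X):

# Optional: profile each cluster by its average feature values
print(df.groupby('Cluster')[features].mean().round(2))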

# Linear Regression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)

print("Linear Regression MAE:", mean_absolute_error(y_test, y_pred))

print("Linear Regression MSE:", mean_squared_error(y_test, y_pred))

print("Linear Regression R²:", r2_score(y_test, y_pred))

# Logistic Regression
df['Price_Category'] = (df['Price'] > df['Price'].median()).astype(int)

y = df['Price_Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

log_reg = LogisticRegression(max_iter=1000)

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

print("Classification Report:\n", classification_report(y_test, y_pred))

1. Pair Plot

Visualization: A pair plot allows you to visualize relationships between multiple numerical
features at once, showing scatter plots for each pair of variables.

sns.pairplot(df, vars=['Price', 'Kilometers_Driven', 'Car_Age', 'Engine', 'Power'], diag_kind='kde')
plt.suptitle('Pair Plot of Numerical Features', y=1.02)
plt.show()

Explanation: This plot helps you identify correlations and relationships between features,
such as whether higher engine power is associated with higher prices or if older cars tend to
have lower prices.

2. Violin Plot

Visualization: A violin plot can show the distribution of prices across different categories
(like Fuel_Type).

plt.figure(figsize=(10, 6))
sns.violinplot(x='Fuel_Type', y='Price', data=df, inner='quartile')
plt.title('Price Distribution by Fuel Type')
plt.show()

Explanation: This plot provides insight into how prices are distributed for different fuel
types, revealing the range, median, and potential outliers in the data.

3. Box Plot for Price by Owner Type


Visualization: A box plot can compare the price distributions across different owner types.

plt.figure(figsize=(10, 6))
sns.boxplot(x='Owner_Type', y='Price', data=df)
plt.title('Price Distribution by Owner Type')
plt.show()

Explanation: This visualization helps you understand how the price varies with the number of
previous owners. It can show which owner types tend to command higher or lower prices,
revealing patterns in the used car market.
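
To put numbers behind the box plot, an optional follow-up is a quick median-price table per owner type:

# Optional: median price per owner type, highest first
print(df.groupby('Owner_Type')['Price'].median().sort_values(ascending=False))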

4. Heatmap of Categorical Variables

Visualization: A heatmap showing the average price for combinations of Fuel_Type and
Transmission can illustrate how these features interact.

price_heatmap = df.groupby(['Fuel_Type', 'Transmission'])['Price'].mean().unstack()
plt.figure(figsize=(10, 6))
sns.heatmap(price_heatmap, annot=True, cmap='YlGnBu')
plt.title('Average Price by Fuel Type and Transmission')
plt.ylabel('Fuel Type')
plt.xlabel('Transmission')
plt.show()

Explanation: This heatmap highlights how average prices vary with different combinations
of fuel types and transmission types, allowing for quick comparisons between categories.

5. Scatter Plot with Trend Line

Visualization: A scatter plot of Price vs. Engine with a regression line can visualize
relationships.

plt.figure(figsize=(10, 6))
sns.regplot(x='Engine', y='Price', data=df)
plt.title('Price vs. Engine Size with Regression Line')
plt.show()

Explanation: This plot helps to visualize the relationship between engine size and price,
showing trends and potential outliers. The regression line can indicate whether larger engines
tend to lead to higher prices.

6. Bar Chart of Average Mileage by Fuel Type

Visualization: A bar chart showing average mileage by fuel type can highlight efficiency
differences.

average_mileage = df.groupby('Fuel_Type')['Mileage'].mean().reset_index()
plt.figure(figsize=(10, 6))
sns.barplot(x='Fuel_Type', y='Mileage', data=average_mileage)
plt.title('Average Mileage by Fuel Type')
plt.show()

Explanation: This visualization can provide insights into which fuel types tend to be more
fuel-efficient, potentially influencing purchasing decisions.

7. Cumulative Distribution Function (CDF)

Visualization: A CDF can show the probability that a car's price is less than or equal to a
certain value.

plt.figure(figsize=(10, 6))
sns.ecdfplot(df['Price'])
plt.title('Cumulative Distribution Function of Car Prices')
plt.xlabel('Price')
plt.ylabel('Cumulative Probability')
plt.show()

Explanation: This plot helps to understand the distribution of car prices across the dataset,
indicating what percentage of cars are priced below a certain threshold.
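
To read specific thresholds off the CDF, an optional complement is to print a few price quantiles:

# Optional: price values below which 25%, 50%, 75% and 90% of cars fall
print(df['Price'].quantile([0.25, 0.50, 0.75, 0.90]))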
