0% found this document useful (0 votes)
0 views

Coding Notes Data Science

All coding related to data science

Uploaded by

Saman
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
0 views

Coding Notes Data Science

All coding related to data science

Uploaded by

Saman
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 4

Coding Notes Data Science

pd.read_csv
import pandas as pd

# Load the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="people.csv")

# Bare expression: shows the frame when evaluated as the last line of a notebook cell.
df
pd.read_excel
# Install the dependencies from a shell first (these are shell commands,
# not Python statements):
#   pip install pandas
#   pip install openpyxl
# openpyxl is the engine pandas uses for .xlsx workbooks; xlrd 2.x only
# reads the legacy .xls format.
import pandas as pd

# Read the first sheet of the workbook into a DataFrame and print it.
df = pd.read_excel("sample.xlsx")
print(df)
pd.read_sql
# import the modules
import pandas as pd
from sqlalchemy import create_engine

# SQLAlchemy connectable. The context manager closes the connection as soon
# as the table has been read.
# NOTE: the URL must not end in whitespace, otherwise the space becomes part
# of the database file name and a different (empty) database is opened.
with create_engine('sqlite:///contacts.db').connect() as cnx:
    # table named 'contacts' will be returned as a dataframe.
    df = pd.read_sql_table('contacts', cnx)

print(df)
pd.read_table
# importing pandas
import pandas as pd

# read_table defaults to tab-separated input, so the comma separator is given explicitly.
pd.read_table('people.csv', sep=',')
Clean a real-world messy dataset (e.g., from Kaggle)
# modules we'll use
import pandas as pd
import numpy as np

# Fix NumPy's random seed up front so any later random sampling is repeatable.
np.random.seed(0)

# Load the complete play-by-play dataset.
nfl_data = pd.read_csv("../input/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv")

# Preview the first five rows; missing entries are already visible here.
nfl_data.head()

# Count the missing data points in each column.
missing_values_count = nfl_data.isna().sum()

# Show the missing-value counts for the first ten columns.
missing_values_count.head(10)
# how many total missing values do we have?
# np.prod is used because the np.product alias was deprecated and then
# removed in NumPy 2.0.
total_cells = np.prod(nfl_data.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

# look at the # of missing points in the first ten columns
missing_values_count[0:10]
# Drop every row that has at least one missing value (result is only displayed, not kept).
nfl_data.dropna()

# Drop every column that has at least one missing value and preview what is left.
columns_with_na_dropped = nfl_data.dropna(axis='columns')
columns_with_na_dropped.head()

# Compare column counts before and after to see how much data was lost.
original_column_count = nfl_data.shape[1]
remaining_column_count = columns_with_na_dropped.shape[1]
print("Columns in original dataset: %d \n" % original_column_count)
print("Columns with na's dropped: %d" % remaining_column_count)
# get a small subset of the NFL dataset
subset_nfl_data = nfl_data.loc[:, 'EPA':'Season'].head()
subset_nfl_data

# replace all NA's with 0
subset_nfl_data.fillna(0)

# replace each NA with the value that comes directly after it in the same column,
# then replace all the remaining NA's with 0.
# DataFrame.bfill() is used because fillna(method=...) is deprecated in recent pandas.
subset_nfl_data.bfill(axis=0).fillna(0)
Apply EDA on a student performance dataset
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Load the students-performance dataset. The path has to be one unbroken
# string literal; a line break inside the quotes is a syntax error.
df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

# Quick structural overview: column types, summary statistics and dimensions.
df.info()
df.describe()
df.shape
df.isnull().sum()  # checks if there are any missing values
Let's start by plotting graphs
# Use a large figure so the many distinct score values stay readable.
plt.rcParams['figure.figsize'] = (20, 10)

# Pass the column via x=...: current seaborn accepts only `data` positionally,
# so a Series passed positionally is no longer treated as the x variable.
sns.countplot(x=df['math score'], palette='dark')
plt.title('Math Score', fontsize=20)
plt.show()
To analyse the data in more depth, let's add a few new columns: Total marks, Percentage and Grades.

# Total marks: the three subject scores added together for each student.
df['total marks'] = df['math score'].add(df['reading score']).add(df['writing score'])

# Percentage: total marks expressed out of the 300 marks available.
df['percentage'] = df['total marks'].div(300).mul(100)
#Assigning the grades

def determine_grade(scores):
    """Map a percentage score to its grade label.

    Args:
        scores: A single numeric percentage.

    Returns:
        'Grade A' for 85-100 (inclusive), 'Grade B' for 70 up to 85,
        'Grade C' for 55 up to 70, 'Grade D' for 35 up to 55,
        'Grade E' for 0 up to 35, and None for anything outside 0-100
        (including NaN, which fails every comparison).
    """
    if 85 <= scores <= 100:
        return 'Grade A'
    if 70 <= scores < 85:
        return 'Grade B'
    if 55 <= scores < 70:
        return 'Grade C'
    if 35 <= scores < 55:
        return 'Grade D'
    if 0 <= scores < 35:
        return 'Grade E'
    # Out-of-range or NaN input: no grade applies.
    return None

# Label every student with the grade derived from their percentage.
df['grades'] = df['percentage'].map(determine_grade)
df.info()

# Pie chart of how many students fall in each grade, labelled with percentages.
grade_counts = df['grades'].value_counts()
grade_counts.plot.pie(autopct="%1.1f%%")
plt.show()
Implementation of Linear Regression Model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Toy dataset (swap in real data): one feature column and its target values.
X = np.arange(1, 6).reshape(-1, 1)  # Independent variable (features)
y = np.array([2, 4, 5, 4, 5])  # Dependent variable (target)

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the linear regression model and fit it on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out targets and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Report the fitted parameters.
print(f"Intercept: {model.intercept_}")
print(f"Coefficient: {model.coef_}")

Implementation of Random Forest Model


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Read the dataset (replace 'your_data.csv' with your actual file).
data = pd.read_csv('your_data.csv')

# Split the frame into the target column (y) and everything else (X).
# Replace 'target_column_name' with the real target column.
y = data['target_column_name']
X = data.drop(columns='target_column_name')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest classifier; n_estimators is the number of trees in the forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training rows, then predict labels for the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the true held-out labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

You might also like