Data Science and Analytics Laboratory
5 Regression
6 Z-Test
7 T-Test
8 ANOVA
LIST OF PROGRAMS
EXP-1 Working with pandas DataFrames
import pandas as pd
# Create a DataFrame from a dictionary
data = {'Name': ['Alice','Bob','Charlie'],
'Age': [25,30,22],
'City': ['New York','San Francisco','Los Angeles']}
df = pd.DataFrame(data)
# Display the DataFrame
print("Original DataFrame:")
print(df)
# Add a new column
df['Occupation'] = ['Engineer','Designer','Actor']
# Filter rows based on a condition
young_people = df[df['Age'] < 30]
# Display the updated DataFrame
print("\nDataFrame with New Column:")
print(df)
print("\nYoung People:")
print(young_people)
Eg 2:
# Accessing and modifying elements
print("\nAccessing and Modifying Elements:")
print(df['Name']) # Access a specific column
df.loc[1,'Age'] = 31 # Modify a specific element using row and column labels (.loc)
df.at[2,'City'] = 'Hollywood' # Modify a single element quickly with .at
print(df)
# Basic statistics
print("\nBasic Statistics:")
print("Mean Age:", df['Age'].mean())
print("Maximum Age:", df['Age'].max())
# Sorting
print("\nSorting:")
df_sorted = df.sort_values(by='Age', ascending=False) # Sort DataFrame by a column
print(df_sorted)
# Grouping and Aggregation
print("\nGrouping and Aggregation:")
city_group = df.groupby('City')
city_stats = city_group.agg({'Age': ['mean','min','max']})
print(city_stats)
# Concatenating DataFrames
df2 = pd.DataFrame({'Name': ['David','Eva'], 'Age': [28,26], 'City': ['Chicago','Miami']})
concatenated_df = pd.concat([df, df2], ignore_index=True)
print("\nConcatenated DataFrame:")
print(concatenated_df)
# Merging DataFrames
df3 = pd.DataFrame({'Name': ['Alice','Bob','Charlie'], 'Occupation': ['Engineer','Designer','Actor']})
merged_df = pd.merge(df, df3, on='Name', how='left')
print("\nMerged DataFrame:")
print(merged_df)
OUTPUT:
EXP-3 Frequency distributions, Averages and Variability
Eg 1: Frequency distributions
import matplotlib.pyplot as plt
import pandas as pd
# Sample data
data = [1,2,2,3,3,3,4,4,4,4,5,5,5,5,5]
# Create a pandas Series
series = pd.Series(data)
# Calculate frequency distribution using value_counts
frequency_distribution = series.value_counts().sort_index()
# Display frequency distribution
print("Frequency Distribution:")
print(frequency_distribution)
# Plotting the histogram
plt.bar(frequency_distribution.index, frequency_distribution.values, color='blue')
plt.title('Frequency Distribution')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()
OUTPUT:
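For comparison, the same frequency table can be built with only the standard library; a minimal sketch (an addition, not part of the original listing) using collections.Counter:
from collections import Counter
data = [1,2,2,3,3,3,4,4,4,4,5,5,5,5,5]
counts = Counter(data)  # maps each distinct value to its frequency
for value in sorted(counts):
    print(value, counts[value])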
Eg 2a: Averages
# Using built-in functions
numbers = [10,20,30,40,50]
average = sum(numbers) / len(numbers)
print("Average:", average)
OUTPUT:
Eg 2b: Averages
import numpy as np
# Using NumPy's mean function
numbers = np.array([10,20,30,40,50])
average = np.mean(numbers)
print("Average:", average)
OUTPUT:
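Beyond the mean, Python's statistics module provides the other common averages; a small sketch (an addition, using made-up numbers):
import statistics
numbers = [10,20,30,40,50]
print("Median:", statistics.median(numbers))  # middle value: 30
print("Mode:", statistics.mode([10,20,20,30]))  # most frequent value: 20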
Eg 3: Variability
import numpy as np
data = np.array([10,12,15,18,22,25,30,35,40])
# Range (the difference between the maximum and minimum values)
range_value = np.max(data) - np.min(data)
print(f"Range: {range_value}")
# Variance
variance_value = np.var(data)
print(f"Variance: {variance_value}")
# Standard Deviation
std_deviation_value = np.std(data)
print(f"Standard Deviation: {std_deviation_value}")
OUTPUT:
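Note that np.var and np.std default to the population formulas (ddof=0, dividing by n). A short sketch of the sample versions, which divide by n - 1 (an addition for clarity):
import numpy as np
data = np.array([10,12,15,18,22,25,30,35,40])
print("Sample Variance:", np.var(data, ddof=1))  # divides by n - 1
print("Sample Standard Deviation:", np.std(data, ddof=1))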
EXP-4 Normal Curves, Correlations and Correlation Coefficient
Eg 1: Normal Curves
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
data = np.random.normal(size=1000, loc=0, scale=1)
# Plot the histogram
plt.hist(data, bins=30, density=True, alpha=0.6, color='g')
# Fit a normal distribution to the data
mu, std = norm.fit(data)
# Plot the PDF (Probability Density Function) of the fitted distribution
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
# Customize the plot
plt.title("Fit results: mu = %.2f, std = %.2f" % (mu, std))
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()
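As a complementary check (an addition, not in the original listing), a Shapiro-Wilk test from scipy.stats quantifies how consistent the sample is with a normal distribution:
import numpy as np
from scipy.stats import shapiro
data = np.random.normal(size=1000, loc=0, scale=1)
stat, p = shapiro(data)  # null hypothesis: the data come from a normal distribution
print(f"Shapiro-Wilk statistic = {stat:.4f}, p-value = {p:.4f}")
# A large p-value (e.g., > 0.05) gives no evidence against normality.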
Eg 2: Correlation and Correlation Coefficient
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data = {'Variable1': [1,2,3,4,5],
'Variable2': [5,4,3,2,1]}
df = pd.DataFrame(data)
# Calculate correlation matrix
correlation_matrix = df.corr()
# Display the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)
# Plot a heatmap of the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()
# Get the correlation coefficient between two variables
correlation_coefficient = df['Variable1'].corr(df['Variable2'])
print("\nCorrelation Coefficient between Variable1 and Variable2:",
correlation_coefficient)
OUTPUT:
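df.corr() reports Pearson's r but no significance level; a minimal sketch (an addition) using scipy.stats.pearsonr, which also returns a p-value:
from scipy.stats import pearsonr
r, p = pearsonr([1,2,3,4,5], [5,4,3,2,1])
print("Pearson r:", r)  # -1.0, a perfect negative linear relationship
print("p-value:", p)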
EXP-5 Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
# Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Plot the training data and the linear regression line
plt.scatter(X_train, y_train, color='blue', label='Training Data')
plt.scatter(X_test, y_test, color='red', label='Testing Data')
plt.plot(X_test, y_pred, color='green', linewidth=3, label='Linear Regression Line')
plt.title('Linear Regression Example')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
OUTPUT:
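Because the data were generated as y = 4 + 3x + noise, the fitted parameters can be checked against those true values; a short evaluation sketch (an addition, assuming the model, y_test and y_pred from the listing above):
from sklearn.metrics import mean_squared_error, r2_score
print("Intercept:", model.intercept_)  # should be close to 4
print("Slope:", model.coef_)  # should be close to 3
print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))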
EXP-6 Z-Test
import numpy as np
import statistics
import math
np.random.seed(42)
sample_data = np.random.normal(loc=28, scale=5, size=10) # Normal distribution with mean 28
# Specify the null hypothesis mean (population mean under the null hypothesis)
null_mean = 30
sample_mean = statistics.mean(sample_data)
sample_stddev = statistics.stdev(sample_data) # stands in for the (unknown) population standard deviation
# Calculate the Z-statistic
z_statistic = (sample_mean - null_mean) / (sample_stddev / math.sqrt(len(sample_data)))
# Calculate the p-value for a two-tailed test: p = 2 * (1 - CDF(|z|)),
# where the standard normal CDF is 0.5 * (1 + erf(z / sqrt(2)))
p_value = 2 * (1 - 0.5 * (1 + math.erf(abs(z_statistic) / math.sqrt(2))))
# Display the results
print("Generated Data:", sample_data)
print("\nZ-statistic:", z_statistic)
print("P-value:", p_value)
# Compare the p-value to a significance level (e.g., 0.05) to make a decision
alpha = 0.05
if p_value < alpha:
    print("\nReject the null hypothesis")
else:
    print("\nFail to reject the null hypothesis")
EXP-7 T-Test
Eg 1: One sample t-test
import numpy as np
from scipy import stats
sample_data = np.array([25,27,30,29,28,31,26,27,29,30])
# Specify the null hypothesis mean (population mean under the null hypothesis)
null_mean = 28
# Perform one-sample t-test
t_statistic, p_value = stats.ttest_1samp(sample_data, null_mean)
# Display the results
print("One-Sample T-statistic:", t_statistic)
print("P-value:", p_value)
# Compare the p-value to a significance level (e.g., 0.05) to make a decision
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")
Logistic Regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# Load the Iris dataset
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['Target'] = iris.target
# For simplicity, consider only two classes (binary classification):
# drop class 0 and keep classes 1 and 2, so the model separates those two
data_binary = data[data['Target'] != 0]
X = data_binary.iloc[:, :2] # Use only the first two features for simplicity
y = data_binary['Target']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('\nConfusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_rep)
# Visualize the decision boundary for two features
plt.figure(figsize=(8,6))
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap='viridis', edgecolors='k', s=50)
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
# Plot decision boundary
h = 0.02 # step size for the decision-boundary mesh grid
x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap='viridis', alpha=0.3)
plt.title('Logistic Regression Decision Boundary')
plt.show()
OUTPUT:
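Logistic regression also yields class probabilities, not just labels; a brief sketch (an addition, assuming the fitted model and X_test from the listing above) using predict_proba:
# Probability estimates for the first three test samples (columns follow model.classes_)
probs = model.predict_proba(X_test[:3])
print("Classes:", model.classes_)
print("Probabilities:")
print(probs)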