
Data Science and Analytics Laboratory

This document presents a series of programming exercises on data analysis and statistical methods in Python, using libraries such as Pandas, Matplotlib, and SciPy. It covers data manipulation with Pandas, basic plotting, statistical tests (Z-test, t-test, ANOVA), regression, and time series analysis. Each exercise includes code examples and an explanation of the task it performs.


S. No | Date | List of Programs | Page No. | Marks | Signature

Tools: Python, NumPy, SciPy, Matplotlib, Pandas, Statsmodels, Seaborn, Plotly, Bokeh; working with NumPy arrays

1. Working with Pandas data frames
2. Basic plots using Matplotlib
3. Frequency distributions, averages, variability
4. Normal curves, correlation and scatter plots, correlation coefficient
5. Regression
6. Z-test
7. T-test
8. ANOVA
9. Building and validating linear models, logistic models
10. Time series analysis

LIST OF PROGRAMS
EXP-1 Working with Pandas data frames
import pandas as pd
# Create a DataFrame from a dictionary
data = {'Name': ['Alice','Bob','Charlie'],
'Age': [25,30,22],
'City': ['New York','San Francisco','Los Angeles']}
df = pd.DataFrame(data)
# Display the DataFrame
print("Original DataFrame:")
print(df)
# Add a new column
df['Occupation'] = ['Engineer','Designer','Actor']
# Filter rows based on a condition
young_people = df[df['Age'] < 30]
# Display the updated DataFrame
print("\nDataFrame with New Column:")
print(df)
print("\nYoung People:")
print(young_people)
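
In practice the data usually comes from a file rather than an inline dictionary. A minimal sketch of the same workflow starting from a CSV file (the file name people.csv is hypothetical):

import pandas as pd
# Load a table like the one above from a CSV file (hypothetical file name)
df = pd.read_csv('people.csv')  # expects columns such as Name, Age, City
print(df.head())      # first five rows
print(df.describe())  # summary statistics for the numeric columns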

Eg-2:
# Accessing and modifying elements
print("\nAccessing and Modifying Elements:")
print(df['Name']) # Access a specific column
df.loc[1,'Age'] = 31 # Modify a specific element using index and column label
df.at[2,'City'] = 'Hollywood' # Modify a specific element using row and column labels
print(df)
# Basic statistics
print("\nBasic Statistics:")
print("Mean Age:", df['Age'].mean())
print("Maximum Age:", df['Age'].max())
# Sorting
print("\nSorting:")
df_sorted = df.sort_values(by='Age', ascending=False)  # Sort DataFrame by a column
print(df_sorted)
# Grouping and Aggregation
print("\nGrouping and Aggregation:")
city_group = df.groupby('City')
city_stats = city_group.agg({'Age': ['mean','min','max']})
print(city_stats)
# Concatenating DataFrames
df2 = pd.DataFrame({'Name': ['David','Eva'], 'Age': [28,26], 'City':
['Chicago','Miami']})
concatenated_df = pd.concat([df, df2], ignore_index=True)
print("\nConcatenated DataFrame:")
print(concatenated_df)
# Merging DataFrames
df3 = pd.DataFrame({'Name': ['Alice','Bob','Charlie'], 'Occupation':
['Engineer','Designer','Actor']})
merged_df = pd.merge(df, df3, on='Name', how='left')
print("\nMerged DataFrame:")
print(merged_df)

OUTPUT:
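
Real datasets often contain missing values, which the examples above sidestep. A short sketch of the usual Pandas idioms, shown on the merged DataFrame from Eg-2 (which may contain no NaNs, so this is illustrative only):

# Count missing values per column
print(merged_df.isnull().sum())
# Drop rows that contain any NaN
cleaned = merged_df.dropna()
# Or fill missing ages with the column mean
filled = merged_df.fillna({'Age': merged_df['Age'].mean()})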

EXP-2 Basic plots using Matplotlib


import pandas as pd
import matplotlib.pyplot as plt
# Create a DataFrame
data = {'Month': ['Jan','Feb','Mar','Apr','May'],
'Sales': [150,200,180,220,250]}
df = pd.DataFrame(data)
# Line plot
plt.figure(figsize=(8,5))
plt.plot(df['Month'], df['Sales'], marker='o', linestyle='-',
color='b')
plt.title('Monthly Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
# Bar plot
plt.figure(figsize=(8,5))
plt.bar(df['Month'], df['Sales'], color='green')
plt.title('Monthly Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.show()
# Scatter plot
# Creating another DataFrame for demonstration
data_scatter = {'Month': ['Jan','Feb','Mar','Apr','May'],
'Expenses': [30,25,35,40,28]}
df_expenses = pd.DataFrame(data_scatter)
plt.figure(figsize=(8,5))
plt.scatter(df['Sales'], df_expenses['Expenses'], color='red', marker='o')
plt.title('Sales vs Expenses')
plt.xlabel('Sales')
plt.ylabel('Expenses')
plt.grid(True)
plt.show()

OUTPUT:
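
The three plots above can also be combined into one figure. A minimal sketch using plt.subplots with the same DataFrames:

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
axes[0].plot(df['Month'], df['Sales'], marker='o')    # line plot
axes[0].set_title('Line')
axes[1].bar(df['Month'], df['Sales'], color='green')  # bar plot
axes[1].set_title('Bar')
axes[2].scatter(df['Sales'], df_expenses['Expenses'], color='red')  # scatter plot
axes[2].set_title('Scatter')
plt.tight_layout()
plt.show()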
EXP-3 Frequency distributions, Averages and Variability
Eg 1: Frequency distributions
import matplotlib.pyplot as plt
import pandas as pd
# Sample data
data = [1,2,2,3,3,3,4,4,4,4,5,5,5,5,5]
# Create a pandas Series
series = pd.Series(data)
# Calculate frequency distribution using value_counts
frequency_distribution = series.value_counts().sort_index()
# Display frequency distribution
print("Frequency Distribution:")
print(frequency_distribution)
# Plotting the histogram
plt.bar(frequency_distribution.index, frequency_distribution.values,
color='blue')
plt.title('Frequency Distribution')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

OUTPUT:

Eg 2a: Averages
# Using built-in functions
numbers = [10,20,30,40,50]
average = sum(numbers) / len(numbers)
print("Average:", average)
OUTPUT:

Eg 2b: Averages
import numpy as np
# Using NumPy's mean function
numbers = np.array([10,20,30,40,50])
average = np.mean(numbers)
print("Average:", average)
OUTPUT:
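
The mean is only one measure of central tendency. For reference, a short sketch of the median and mode using the same libraries:

import numpy as np
import statistics
numbers = [10, 20, 30, 40, 50]
print('Median:', np.median(numbers))           # middle value: 30.0
print('Mode:', statistics.mode([1, 2, 2, 3]))  # most frequent value: 2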

Eg 3: Variability
import numpy as np
data = np.array([10,12,15,18,22,25,30,35,40])
# Range (the difference between the maximum and minimum values)
range_value = np.max(data) - np.min(data)
print(f"Range: {range_value}")
# Variance
variance_value = np.var(data)
print(f"Variance: {variance_value}")
# Standard Deviation
std_deviation_value = np.std(data)
print(f"Standard Deviation: {std_deviation_value}")
OUTPUT:
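
Note that np.var and np.std compute the population variance and standard deviation (dividing by n). For the sample versions, which divide by n - 1, pass ddof=1:

# Sample (rather than population) variance and standard deviation
print(f"Sample Variance: {np.var(data, ddof=1)}")
print(f"Sample Standard Deviation: {np.std(data, ddof=1)}")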
EXP-4 Normal Curves, Correlations and Correlation Coefficient
Eg 1: Normal Curves
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
data = np.random.normal(size=1000, loc=0, scale=1)
# Plot the histogram
plt.hist(data, bins=30, density=True, alpha=0.6, color='g')
# Fit a normal distribution to the data
mu, std = norm.fit(data)
# Plot the PDF (probability density function) of the fitted distribution
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
# Customize the plot
plt.title("Fit results: mu = %.2f, std = %.2f" % (mu, std))
plt.xlabel('Value')
plt.ylabel('Density')  # density=True normalizes the histogram to a density
plt.show()
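
Fitting a normal curve to the histogram does not by itself confirm normality. A minimal sketch of the Shapiro-Wilk normality test, using SciPy as above:

from scipy.stats import shapiro
stat, p = shapiro(data)
print(f'Shapiro-Wilk statistic: {stat:.4f}, p-value: {p:.4f}')
# A p-value below 0.05 would suggest the data is not normally distributed
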
Eg 2: Correlation and Correlation Coefficient
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  # needed for the title and show calls below
data = {'Variable1': [1,2,3,4,5],
'Variable2': [5,4,3,2,1]}
df = pd.DataFrame(data)
# Calculate correlation matrix
correlation_matrix = df.corr()
# Display the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)
# Plot a heatmap of the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()
# Get the correlation coefficient between two variables
correlation_coefficient = df['Variable1'].corr(df['Variable2'])
print("\nCorrelation Coefficient between Variable1 and Variable2:",
correlation_coefficient)

OUTPUT:
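
df.corr uses Pearson correlation by default; rank-based alternatives are available through its method parameter:

# Spearman rank correlation (captures monotone but non-linear relationships)
print(df.corr(method='spearman'))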
EXP-5 Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)  # y = 4 + 3x plus Gaussian noise
# Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)
# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Plot the training data and the linear regression line
plt.scatter(X_train, y_train, color='blue', label='Training Data')
plt.scatter(X_test, y_test, color='red', label='Testing Data')
plt.plot(X_test, y_pred, color='green', linewidth=3, label='Linear Regression Line')
plt.title('Linear Regression Example')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

OUTPUT:
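
Because the data was generated as y = 4 + 3x plus noise, the fit can be sanity-checked directly from the model's learned parameters:

# The estimates should land close to the true intercept (4) and slope (3)
print('Intercept:', model.intercept_)
print('Slope:', model.coef_)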
EXP-6 Z-Test
import numpy as np
import statistics
import math
np.random.seed(42)
sample_data = np.random.normal(loc=28, scale=5, size=10)  # normal distribution with mean 28
# Specify the null hypothesis mean (population mean under the null hypothesis)
null_mean = 30
sample_mean = statistics.mean(sample_data)
sample_stddev = statistics.stdev(sample_data)
# Calculate the Z-statistic
z_statistic = (sample_mean - null_mean) / (sample_stddev / math.sqrt(len(sample_data)))
# Calculate the p-value for a two-tailed test
p_value = 2 * (1 - 0.5 * (1 + math.erf(abs(z_statistic) / math.sqrt(2))))
# Display the results
print("Generated Data:", sample_data)
print("\nZ-statistic:", z_statistic)
print("P-value:", p_value)
# Compare the p-value to a significance level (e.g., 0.05) to make a decision
alpha = 0.05
if p_value < alpha:
    print("\nReject the null hypothesis")
else:
    print("\nFail to reject the null hypothesis")

EXP-7 T-Test
Eg 1: One sample t-test
import numpy as np
from scipy import stats
sample_data = np.array([25,27,30,29,28,31,26,27,29,30])
# Specify the null hypothesis mean (population mean under the null hypothesis)
null_mean = 28
# Perform one-sample t-test
t_statistic, p_value = stats.ttest_1samp(sample_data, null_mean)
# Display the results
print("One-Sample T-statistic:", t_statistic)
print("P-value:", p_value)
# Compare the p-value to a significance level (e.g., 0.05) to make a decision
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

Eg 2: Two-sample t-test

import numpy as np
from scipy import stats
# Example data for two groups
group1_data = np.array([25,27,30,29,28])
group2_data = np.array([31,26,27,29,30])
# Perform two-sample t-test
t_statistic, p_value = stats.ttest_ind(group1_data, group2_data)
# Display the results
print("\nTwo-Sample T-statistic:", t_statistic)
print("P-value:", p_value)
# Compare the p-value to a significance level (e.g., 0.05) to make a decision
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")
OUTPUT:
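
By default, ttest_ind assumes the two groups have equal variances. When that assumption is doubtful, Welch's t-test is the usual alternative; a one-line variant of the same SciPy call:

# Welch's t-test: does not assume equal group variances
t_statistic, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=False)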

EXP-8 ANOVA Test


import numpy as np
from scipy.stats import f_oneway
np.random.seed(42)
group1 = np.random.normal(30, 5, 50)  # mean=30, standard deviation=5
group2 = np.random.normal(35, 5, 50)  # mean=35, standard deviation=5
group3 = np.random.normal(40, 5, 50)  # mean=40, standard deviation=5
# Perform one-way ANOVA
f_statistic, p_value = f_oneway(group1, group2, group3)
# Display the results
print("F-statistic:", f_statistic)
print("P-value:", p_value)
# Compare the p-value to a significance level (e.g., 0.05) to make a decision
alpha = 0.05
if p_value < alpha:
    print("\nReject the null hypothesis (there are significant differences among group means)")
else:
    print("\nFail to reject the null hypothesis (no significant differences among group means)")
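
A significant ANOVA result only says that at least one group mean differs, not which ones. A possible follow-up sketch (assuming Statsmodels is installed, as listed in the tools) using Tukey's HSD post-hoc test to compare each pair of groups:

from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Stack the three samples and label which group each observation belongs to
scores = np.concatenate([group1, group2, group3])
labels = ['group1'] * 50 + ['group2'] * 50 + ['group3'] * 50
print(pairwise_tukeyhsd(scores, labels))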

EXP-9 Building and Validating Linear Models


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
# Load the California Housing dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(california_housing.data,
columns=california_housing.feature_names)
data['PRICE'] = california_housing.target
# Use a single feature for simplicity, say 'MedInc' (median income)
X = data[['MedInc']]
y = data['PRICE']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)
# Build a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')
# Visualize the linear regression line
plt.scatter(X_test, y_test, color='black', label='Actual')
plt.plot(X_test, y_pred, color='blue', linewidth=3, label='Predicted')
plt.xlabel('Median Income (MedInc)')
plt.ylabel('Housing Price')
plt.legend()
plt.show()
OUTPUT:
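
A single train/test split can be sensitive to how the data happens to be divided. As an additional validation step, a minimal sketch of five-fold cross-validation with scikit-learn's cross_val_score (same feature and target as above):

from sklearn.model_selection import cross_val_score
# R-squared score on each of five folds; the mean gives a more stable estimate
scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print('Cross-validated R-squared scores:', scores)
print('Mean R-squared:', scores.mean())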
EXP-10 Building and Validating Logistic Models

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# Load the Iris dataset
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['Target'] = iris.target
# For simplicity, reduce this to a binary classification problem:
# keep only classes 1 and 2 (versicolor vs. virginica) and drop class 0
data_binary = data[data['Target'] != 0]
X = data_binary.iloc[:, :2]  # use only the first two features for simplicity
y = data_binary['Target']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)
# Build a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('\nConfusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_rep)
# Visualize the decision boundary for two features
plt.figure(figsize=(8,6))
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap='viridis',
edgecolors='k', s=50)
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
# Plot decision boundary
h = .02
x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min,
y_max, h))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap='viridis', alpha=0.3)
plt.title('Logistic Regression Decision Boundary')
plt.show()
OUTPUT:
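
Accuracy alone can hide how confident the model is. A minimal sketch (reusing the fitted model and test split above) that scores the predicted probabilities with ROC AUC:

from sklearn.metrics import roc_auc_score
# Column 1 of predict_proba corresponds to the second class in model.classes_
y_prob = model.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, y_prob))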

EXP-11 Time Series Analysis


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
np.random.seed(42)
date_rng = pd.date_range(start='2022-01-01', end='2022-12-31',
freq='D')
ts = pd.Series(np.random.randn(len(date_rng)), index=date_rng)
# Visualize the time series
ts.plot(figsize=(10,6))
plt.title('Sample Time Series')
plt.show()
# Decompose the time series into trend, seasonal, and residual components
result = seasonal_decompose(ts, model='additive', period=30)
result.plot()
plt.show()
# Perform Augmented Dickey-Fuller Test for stationarity
adf_result = adfuller(ts)
print(f'ADF Statistic: {adf_result[0]}')
print(f'p-value: {adf_result[1]}')
print('Critical Values:')
for key, value in adf_result[4].items():
    print(f'{key}: {value}')
# Plot the autocorrelation function (ACF) and partial autocorrelation function (PACF)
plot_acf(ts, lags=30)
plt.title('Autocorrelation Function (ACF)')
plt.show()
plot_pacf(ts, lags=30)
plt.title('Partial Autocorrelation Function (PACF)')
plt.show()

OUTPUT:
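
The random series above is stationary by construction, but real series often are not. A minimal sketch (same pandas and statsmodels setup) of first-differencing a series and re-running the ADF test:

# First differencing removes a linear trend; drop the NaN the shift creates
ts_diff = ts.diff().dropna()
adf_diff = adfuller(ts_diff)
print(f'ADF Statistic after differencing: {adf_diff[0]}')
print(f'p-value after differencing: {adf_diff[1]}')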
