0% found this document useful (0 votes)
903 views

Program

The document contains multiple Python scripts that perform data analysis and visualization using libraries such as pandas, numpy, and matplotlib. Key tasks include generating random student scores, analyzing heart disease data with logistic regression, examining cancer-related deaths over time, and evaluating employee details. Additionally, it covers statistical tests, sales data analysis, and visualizations like correlation matrices and trend plots.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
903 views

Program

The document contains multiple Python scripts that perform data analysis and visualization using libraries such as pandas, numpy, and matplotlib. Key tasks include generating random student scores, analyzing heart disease data with logistic regression, examining cancer-related deaths over time, and evaluating employee details. Additionally, it covers statistical tests, sales data analysis, and visualizations like correlation matrices and trend plots.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 10

1.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Script 1: generate random student scores, round-trip them through a CSV
# file, and visualise the subject-to-subject correlation matrix.

# 50 random integer scores in [50, 100) for each of five subjects.
data = {
    "Maths": np.random.randint(50, 100, 50),
    "Science": np.random.randint(50, 100, 50),
    "History": np.random.randint(50, 100, 50),
    "English": np.random.randint(50, 100, 50),
    "Geography": np.random.randint(50, 100, 50),
}
df = pd.DataFrame(data)
csv_path = 'student scores.csv'
df.to_csv(csv_path, index=False)

# Read the numbers back from disk; skiprows=1 skips the header line.
data_array = np.loadtxt(csv_path, delimiter=',', skiprows=1)

# rowvar=False: each column (subject) is a variable, each row an observation.
correlation_matrix = np.corrcoef(data_array, rowvar=False)

# Take the labels from the DataFrame so they always match the column order.
subjects = list(df.columns)
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    xticklabels=subjects,
    yticklabels=subjects,
    cmap="coolwarm",
    cbar=True,
    linewidths=0.5,
    fmt=".2f",
)
plt.title("Correlation Matrix Heatmap")
plt.xlabel("Subjects")
plt.ylabel("Subjects")
plt.show()
print("Correlation Matrix:")
print(correlation_matrix)

# The scores are unseeded random draws, so which subject pairs correlate
# changes on every run. Report the extreme pairs from the matrix itself
# rather than printing fixed claims about particular subjects.
subject_pairs = [
    (correlation_matrix[i, j], subjects[i], subjects[j])
    for i in range(len(subjects))
    for j in range(i + 1, len(subjects))
]
highest_r, highest_a, highest_b = max(subject_pairs)
lowest_r, lowest_a, lowest_b = min(subject_pairs)

print("\nInterpretation:")
print("1. Diagonal values are 1.00, indicating perfect correlation with itself.")
print("2. Most off-diagonal values are close to zero, indicating weak relationships.")
print(
    f"3. Highest off-diagonal correlation: {highest_a} and {highest_b} "
    f"({highest_r:.2f})."
)
print(
    f"4. Lowest off-diagonal correlation: {lowest_a} and {lowest_b} "
    f"({lowest_r:.2f})."
)

2.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

# Script 2: logistic-regression classifier on the heart-disease dataset.

# NOTE(review): the file has an .xls extension but is parsed as CSV text, as
# in the original read_csv call — confirm the real format of the file on disk.
file_path = "Heart_Disease_UCI.xls"
data = pd.read_csv(file_path)

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line).
data.info()
df = data  # read_csv already returns a DataFrame
print("\nMissing Values:\n", df.isnull().sum())

# Features are every column except the label column "Target".
X = df.drop("Target", axis=1)
y = df["Target"]

# stratify=y keeps the class proportions equal in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then reuse it for the test split
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# NOTE(review): these metrics use their default settings and the plot uses two
# class labels, which presumes "Target" is binary — confirm against the data.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

plt.figure(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["No Disease", "Disease"],
    yticklabels=["No Disease", "Disease"],
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Value")
plt.ylabel("Actual value")
plt.show()

3.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Script 3: trend plot and linear regression of cancer-related deaths.

# NOTE(review): the file has an .xls extension but is parsed as CSV text, as
# in the original read_csv call — confirm the real format of the file on disk.
file_path = "cancer.xls"
data = pd.read_csv(file_path)

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its report itself and returns None; do not wrap it in print().
data.info()
df = data  # read_csv already returns a DataFrame
print("\nmissing Values:")
print(data.isnull().sum())

plt.figure(figsize=(12, 6))
sns.lineplot(
    x="Year",
    y="CancerRelatedDeaths",
    data=data,
    label="Cancer-Related Deaths",
    marker="o",
)
plt.title("Trend of Cancer-Related Deaths Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Deaths")
plt.legend()
plt.grid()
plt.show()

# One shared list keeps training and plotting on exactly the same features.
feature_columns = ["Year", "Population", "HealthExpenditure"]
X = data[feature_columns]
y = data["CancerRelatedDeaths"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Performance:")
print(f"Mean Squared Error(MSE):{mse:.2f}")
print(f"R-squared(R^2):{r2:.2f}")

plt.figure(figsize=(12, 6))
plt.scatter(
    data["Year"],
    data["CancerRelatedDeaths"],
    color="blue",
    label="Actual Data",
    alpha=0.6,
)
# Predictions use all three features but are drawn against Year only.
plt.plot(
    data["Year"],
    model.predict(data[feature_columns]),
    color="red",
    label="Regression Line",
)
plt.title("Cancer-Related Deaths and Regression Line")
plt.xlabel("Year")
plt.ylabel("Number of Deaths")
plt.legend()
plt.grid()
plt.show()

4.

import pandas as pd
import numpy as np
# Script 4: build a 100-row synthetic employee table, save it to CSV, reload
# it, and print a reproducible 25% random sample.

# Each employee gets a designation drawn at random from this list.
designation_pool = ["Manager", "Team Lead", "Developer", "Analyst", "Intern"]
data = {
    "EmpID": np.arange(1, 101),
    "EmpName": [f"Employee_{i}" for i in range(1, 101)],
    "Designation": np.random.choice(designation_pool, size=100),
}
employee_df = pd.DataFrame(data)

# Write without the index column, then read the same file back.
output_csv = "employee_details.csv"
employee_df.to_csv(output_csv, index=False)
print("Dataset saved to 'employee_details.csv'.")
loaded_df = pd.read_csv(output_csv)

# random_state pins the sample so the same rows are chosen on every run.
sampled_df = loaded_df.sample(frac=0.25, random_state=42)
print("\n25% Random Sample of the Employee Dataset: ")
print(sampled_df)

import numpy as np

from scipy.stats import norm


# Script 5: one-sample, right-tailed z-test on simulated heights.

# Fixed seed so the simulated sample (and the decision) is reproducible.
np.random.seed(42)
sample_heights = np.random.normal(loc=150, scale=10, size=40)

sample_mean = np.mean(sample_heights)
# ddof=1 gives the sample (n - 1) standard deviation.
sample_std_dev = np.std(sample_heights, ddof=1)
sample_size = len(sample_heights)

null_hypothesis_mean = 140
significance_level = 0.05

standard_error = sample_std_dev / np.sqrt(sample_size)
z_score = (sample_mean - null_hypothesis_mean) / standard_error
# Upper-tail critical value: reject only when the z-score exceeds it.
critical_z = norm.ppf(1 - significance_level)

if z_score > critical_z:
    decision = "Reject the Null Hypothesis (Ho)"
else:
    decision = "Fail to Reject the Null Hypothesis (Ho)"

print("Z-Test Results:")
print(f"Sample Mean: {sample_mean:.2f}")
print(f"Sample Standard Deviation: {sample_std_dev:.2f}")
print(f"Z-Score: {z_score:.2f}")
print(f"Critical Z Value: {critical_z:.2f}")
print(f"Decision: {decision} ")

import pandas as pd
import numpy as np
# Script 6: summary statistics on the monthly sales file, followed by a
# reshape/transpose demonstration on a synthetic 60-value column.

file_path = "monthly_sales.csv"
data = pd.read_csv(file_path)
print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its report itself and returns None; do not wrap it in print().
data.info()

print("Original Dataset Shape:", data.shape)

# Aggregate numeric columns only: passing the whole DataFrame to
# np.mean/np.sum/np.prod fails on text columns (e.g. month names) and its
# result shape depends on the pandas version.
numeric_data = data.select_dtypes(include="number")

mean_sales = numeric_data.mean()
print("Mean of Monthly Sales:", mean_sales)

sum_sales = numeric_data.sum()
print("Sum of Monthly Sales:", sum_sales)

product_sales = numeric_data.prod()
print("Product of Monthly Sales:", product_sales)

# Replace the loaded data with a synthetic column holding 0..59.
data = pd.DataFrame({'sales': np.arange(60)})

# Convert DataFrame column to NumPy array and reshape
reshaped_sales = data['sales'].values.reshape(60, 1)
print("Reshaped Array (60 rows, 1 column):\n", reshaped_sales)

# Transpose the reshaped array
transposed_sales = reshaped_sales.T
print("Transposed Array:\n", transposed_sales)

import pandas as pd
import numpy as np

# Read dataset
file_path = "monthly_sales.csv"
data = pd.read_csv(file_path)

# Display dataset overview
print("Dataset Overview:")
print(data.head())

print("\nDataset Info:")
# info() prints its report itself and returns None; do not wrap it in print().
data.info()

print("Original Dataset Shape:", data.shape)

# Ensure numerical operations are performed on the relevant column only.
# Assuming 'sales' is the numeric column in the dataset.
if 'sales' in data.columns:
    mean_sales = data['sales'].mean()
    print("Mean of Monthly Sales:", mean_sales)

    sum_sales = data['sales'].sum()
    print("Sum of Monthly Sales:", sum_sales)

    product_sales = data['sales'].prod()
    print("Product of Monthly Sales:", product_sales)
else:
    print("Column 'sales' not found in the dataset.")

# Create a new DataFrame with 60 values (0..59)
data = pd.DataFrame({'sales': np.arange(60)})

# Convert DataFrame column to NumPy array and reshape
reshaped_sales = data['sales'].values.reshape(60, 1)
print("Reshaped Array (60 rows, 1 column):\n", reshaped_sales)

# Transpose the reshaped array
transposed_sales = reshaped_sales.T
print("Transposed Array:\n", transposed_sales)
8

import numpy as np
# Sample 5x3 table mixing integers, NaN gaps and one text cell. It is stored
# with dtype=object so each cell keeps its original Python type.
rows = [
    [10, 20, 30],
    [40, np.nan, 60],
    [70, 80, "invalid"],
    [90, 100, 110],
    [120, np.nan, 150],
]
data = np.array(rows, dtype=object)
print("Original Array:", data, sep="\n")
def replace_nan_with_mean(array):
    """Replace every NaN in a 2-D array with the mean of its column.

    The column mean is computed with ``np.nanmean``, i.e. over the non-NaN
    entries of that column only.

    Args:
        array: 2-D NumPy array whose values can be converted to float.

    Returns:
        A float array with the NaNs filled in. Because the conversion uses
        ``copy=False``, an input that is already a float array is modified
        in place and returned as-is.

    Raises:
        ValueError: if a value cannot be converted to float.
    """
    numeric_array = array.astype("float", copy=False)
    for col in range(numeric_array.shape[1]):
        # Basic slicing yields a view, so writes land in numeric_array.
        col_values = numeric_array[:, col]
        nan_mask = np.isnan(col_values)
        if nan_mask.any():
            col_values[nan_mask] = np.nanmean(col_values)
    return numeric_array
# Converting the object array to float raises ValueError when a cell holds
# non-numeric text; in that case the NaN replacement is skipped.
try:
    numeric_data = replace_nan_with_mean(data.astype("float"))
except ValueError:
    print("\nArray contains non-numeric values. NaN replacement skipped.")
else:
    print("\nArray After Replacing NaN with Column Averages:")
    print(numeric_data)

def remove_non_numeric_rows(array):
    """Return only the rows of a 2-D array whose cells are all real numbers.

    A row is kept when every cell is an int or float (Python or NumPy scalar)
    and is not NaN. Rows containing text, other objects, or NaN are dropped.

    Args:
        array: 2-D NumPy array, typically with dtype=object.

    Returns:
        The rows of ``array`` that passed the check, in their original order.
    """
    numeric_types = (int, float, np.integer, np.floating)
    numeric_mask = np.array(
        [
            # The isinstance test runs first, so np.isnan never sees text.
            all(isinstance(x, numeric_types) and not np.isnan(x) for x in row)
            for row in array
        ],
        # Explicit bool dtype: an empty list would otherwise become a float
        # array, which cannot be used as an index mask.
        dtype=bool,
    )
    return array[numeric_mask]
# Filter the sample array down to its fully numeric rows and show the result.
cleaned_data = remove_non_numeric_rows(data)
print("\nArray After Removing Rows with Non-Numeric values:", cleaned_data, sep="\n")

def contains_row(array, row):
    """Report whether a 2-D array contains a row equal to ``row``.

    Args:
        array: 2-D array-like to search.
        row: 1-D sequence to look for.

    Returns:
        True if some row of ``array`` matches ``row`` element for element,
        otherwise False. A ``row`` whose length differs from the array's
        column count can never match, so False is returned for it.

    Raises:
        ValueError: if ``array`` is not 2-D.
    """
    array = np.asarray(array)
    row = np.asarray(row)
    if array.ndim != 2:
        raise ValueError("array must be 2-D")
    # Guard against shape mismatch: the comparison below would otherwise
    # fail to broadcast (or broadcast a length-1 row against every column).
    if row.ndim != 1 or row.shape[0] != array.shape[1]:
        return False
    return bool((array == row).all(axis=1).any())
# Look for one specific row among the cleaned rows (cast to int first).
row_to_check = [90, 100, 110]
is_present = contains_row(cleaned_data.astype("int"), row_to_check)
answer = 'Yes' if is_present else 'No'
print(f"\n Does the array contain the row {row_to_check}?{answer}")

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

# Load the CSV file
file_path = "student_marks.csv"
scores = pd.read_csv(file_path)

# Display the dataset overview and info
print("Dataset Overview:")
print(scores.head())
print("\nDataset Info:")
# info() prints its report itself and returns None; do not wrap it in print().
scores.info()

# NOTE(review): the statistics below presume every column of the CSV is a
# numeric score — confirm against the actual file.

# 1. Average score for each subject (column-wise average)
subject_avg_scores = scores.mean(axis=0)
print("\nAverage Score for Each Subject Across All Students (by Subject):")
print(subject_avg_scores)

# 2. Average score per student (row-wise average)
student_avg_scores = scores.mean(axis=1)
highest_avg_student = student_avg_scores.idxmax()
lowest_avg_student = student_avg_scores.idxmin()
print("\nStudent with Highest Average Score (Index):", highest_avg_student)
print("Student with Lowest Average Score (Index):", lowest_avg_student)

# 3. Pass rate per subject (score >= 60 considered pass)
# The mean of a boolean column is the fraction of True values.
pass_rate = (scores >= 60).mean(axis=0)
print("\nPass Rate for Each Subject:")
print(pass_rate)

# 4. Correlation matrix between subjects
correlation_matrix = scores.corr()
print("\nCorrelation Matrix Between Subjects:")
print(correlation_matrix)

# 5. Overall average score for each semester (assuming each subject is a
#    semester). This is the same column-wise mean as step 1, so reuse it.
semester_avg_scores = subject_avg_scores
print("\nOverall Average Score for Each Semester:")
print(semester_avg_scores)

# 6. Plotting the average scores (semesters numbered from 1)
x_values = range(1, len(semester_avg_scores) + 1)
plt.figure(figsize=(8, 5))
plt.plot(x_values, semester_avg_scores, marker="o", linestyle="-", color="blue")
plt.title("Overall Average Scores Across Semesters")
plt.xlabel("Semester")
plt.ylabel("Average Score")
plt.grid(True)
plt.show()

# 7. Standard deviation of scores per subject
subject_std_dev = scores.std(axis=0)
print("\nStandard Deviation of Scores for Each Subject:")
print(subject_std_dev)

10

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# Script 10: revenue summary, monthly trend and price/quantity relationship
# for the retail store transactions.

# Read the transactions file.
file_path = "retail_store.csv"
store = pd.read_csv(file_path)
df = pd.DataFrame(store)

# Revenue of a transaction = units sold x price per unit.
df["Revenue"] = df["Quantity"] * df["Unit_Price"]
print("Sample Dataset:", df.head(), sep="\n")

# Revenue summed over all transactions.
total_revenue = df["Revenue"].sum()
print(f"\nTotal revenue Generated: ${total_revenue:.2f}")

# Revenue summed per product; idxmax returns the Product_ID with the largest sum.
product_revenue = df.groupby("Product_ID")["Revenue"].sum()
highest_revenue_product = product_revenue.idxmax()
print(f"\nProduct with Highest Sales Revenue: Product_ID {highest_revenue_product}")

# Mean number of units per transaction.
average_quantity = df["Quantity"].mean()
print(f"\nAverage Quantity Sold per Transaction: {average_quantity:.2f}")

# Reduce each transaction date to its calendar month for grouping.
transaction_dates = pd.to_datetime(df["Date"])
df["Month"] = transaction_dates.dt.to_period("M")

# Revenue per month, drawn as a line chart.
monthly_sales = df.groupby("Month")["Revenue"].sum()
plt.figure(figsize=(10, 6))
monthly_sales.plot(kind="line", marker="o", color="blue")
plt.title("Monthly Sales Trend")
plt.xlabel("Month")
plt.ylabel("Total Revenue")
plt.grid()
plt.show()

# Correlation between units sold and unit price.
price_quantity = df[["Quantity", "Unit_Price"]]
correlation = price_quantity.corr()
print("Correlation Matrix:", correlation, sep="\n")

# Scatter plot of the same two columns.
plt.figure(figsize=(8, 6))
sns.scatterplot(x="Unit_Price", y="Quantity", data=df, alpha=0.7)
plt.title("Correlation Between Quantity and Unit Price")
plt.xlabel("Unit Price")
plt.ylabel("Quantity Sold")
plt.grid()
plt.show()

You might also like