# Program 1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Build a 50-row table of random integer marks, one column per subject.
# np.random.randint(50, 100, 50) draws 50 values with 50 <= mark < 100; the
# subjects are generated in the order listed, one draw call per subject.
data = {
    subject: np.random.randint(50, 100, 50)
    for subject in ("Maths", "Science", "History", "English", "Geography")
}
df = pd.DataFrame(data)

# Round-trip through a CSV file: write the table without the index column,
# then read the numbers back as a plain 2-D float array (header row skipped).
csv_path = 'student scores.csv'
df.to_csv(csv_path, index=False)
data_array = np.loadtxt(csv_path, delimiter=',', skiprows=1)
# Program 2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
# Heart-disease classification: load the dataset, train a logistic-regression
# model on standardized features, and report metrics plus a confusion matrix.

# NOTE(review): `file_path` names a .csv file but is never used; the data is
# actually read (as CSV text) from 'Heart_Disease_UCI.xls'. Confirm which file
# is intended before unifying the two.
file_path = "Heart_Disease_UCI.csv"
data = pd.read_csv('Heart_Disease_UCI.xls')

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

# read_csv already returns a DataFrame, so no re-wrapping is needed; `df` is
# kept as the working name used below.
df = data
print("\nMissing Values:\n", df.isnull().sum())

# Every column except "Target" is a feature; "Target" is the label.
X = df.drop("Target", axis=1)
y = df["Target"]

# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only and reuse it for the test part so
# no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# NOTE(review): precision/recall/F1 are called with their defaults, which
# presumably expects a binary Target — confirm against the dataset.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall:{ recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap.
# NOTE(review): the tick labels assume the first class means "No Disease" and
# the second "Disease" — confirm against the Target encoding.
class_labels = ["No Disease", "Disease"]
plt.figure(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Value")
plt.ylabel("Actual value")
plt.show()
# Program 3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Cancer-related deaths: plot the yearly trend, fit a linear regression on
# Year / Population / HealthExpenditure, and plot the fitted values.

# NOTE(review): `file_path` names "Cancer.csv" but is never used; the data is
# actually read (as CSV text) from 'cancer.xls'. Confirm which file is
# intended before unifying the two.
file_path = "Cancer.csv"
data = pd.read_csv('cancer.xls')

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

# read_csv already returns a DataFrame; `df` is kept only as an alias.
df = data
print("\nmissing Values:")
print(data.isnull().sum())

# Trend of deaths over time.
plt.figure(figsize=(12, 6))
sns.lineplot(
    x="Year",
    y="CancerRelatedDeaths",
    data=data,
    label="Cancer-Related Deaths",
    marker="o",
)
plt.title("Trend of Cancer-Related Deaths Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Deaths")
plt.legend()
plt.grid()
plt.show()

# The feature list is named once so training and plotting cannot drift apart.
feature_columns = ["Year", "Population", "HealthExpenditure"]
X = data[feature_columns]
y = data["CancerRelatedDeaths"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Performance:")
print(f"Mean Squared Error(MSE):{mse:.2f}")
print(f"R-squared(R^2):{r2:.2f}")

# Actual values versus model predictions over the whole dataset.
plt.figure(figsize=(12, 6))
plt.scatter(
    data["Year"],
    data["CancerRelatedDeaths"],
    color="blue",
    label="Actual Data",
    alpha=0.6,
)
# plt.plot joins points in the order given, so draw the fitted values in
# ascending Year order; unsorted rows would make the line zig-zag.
ordered = data.sort_values("Year")
plt.plot(
    ordered["Year"],
    model.predict(ordered[feature_columns]),
    color="red",
    label="Regression Line",
)
plt.title("Cancer-Related Deaths and Regression Line")
plt.xlabel("Year")
plt.ylabel("Number of Deaths")
plt.legend()
plt.grid()
plt.show()
# Program 4
import pandas as pd
import numpy as np
# Generate 100 synthetic employee records: sequential IDs 1..100, matching
# "Employee_<id>" names, and a designation picked at random for each row.
data = dict(
    EmpID=np.arange(1, 101),
    EmpName=["Employee_" + str(i) for i in range(1, 101)],
    Designation=np.random.choice(
        ["Manager", "Team Lead", "Developer", "Analyst", "Intern"], size=100
    ),
)

# Persist the records to CSV (index column omitted).
employee_df = pd.DataFrame(data)
employee_df.to_csv("employee_details.csv", index=False)
print("Dataset saved to 'employee_details.csv'.")

# Read the file back and draw a 25% random sample; random_state is fixed, so
# the same row positions are selected on every run.
loaded_df = pd.read_csv("employee_details.csv")
sampled_df = loaded_df.sample(frac=0.25, random_state=42)
print("\n25% Random Sample of the Employee Dataset: ", sampled_df, sep="\n")
import numpy as np
# Z-test decision and report.
# NOTE(review): z_score, critical_z, sample_mean and sample_std_dev are not
# defined anywhere in the visible file — the part of this program that
# computes them is missing and must run before this point.
# The comparison is one-sided: the null hypothesis is rejected only when
# z_score exceeds critical_z.
if z_score > critical_z:
    decision = "Reject the Null Hypothesis (Ho)"
else:
    decision = "Fail to Reject the Null Hypothesis (Ho)"

print("Z-Test Results:")
print(f"Sample Mean: {sample_mean:.2f}")
print(f"Sample Standard Deviation: {sample_std_dev:.2f}")
print(f"Z-Score: {z_score:.2f}")
# The label previously read "CriticaI" (capital I in place of the final l).
print(f"Critical Z Value: {critical_z:.2f}")
print(f"Decision: {decision} ")
import pandas as pd
import numpy as np
# Monthly sales: load the dataset and report the sum and product of its
# numeric columns.
file_path = "monthly_sales.csv"
data = pd.read_csv(file_path)

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

# Reduce the numeric columns only, with an explicit axis: a label column such
# as a month name cannot be multiplied, and passing the whole DataFrame to
# np.sum / np.prod leaves the reduction axis implicit.
numeric_data = data.select_dtypes(include="number")

sum_sales = numeric_data.sum(axis=0)
print("Sum of Monthly Sales:", sum_sales)

product_sales = numeric_data.prod(axis=0)
print("Product of Monthly Sales:", product_sales)
import pandas as pd
import numpy as np
# Read dataset
file_path = "monthly_sales.csv"
data = pd.read_csv(file_path)

print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

# Sum and product of the 'sales' column. The membership check is the guard
# that the original dangling `else:` branch belonged to.
if "sales" in data.columns:
    sum_sales = data['sales'].sum()
    print("Sum of Monthly Sales:", sum_sales)
    product_sales = data['sales'].prod()
    print("Product of Monthly Sales:", product_sales)
else:
    print("Column 'sales' not found in the dataset.")
import numpy as np
# Sample 5x3 array for the cleaning steps below. It deliberately mixes valid
# numbers, NaN gaps and one string, so dtype=object is needed to hold the
# values as given instead of coercing them to a common type.
data = np.asarray(
    [
        [10, 20, 30],
        [40, np.nan, 60],
        [70, 80, "invalid"],
        [90, 100, 110],
        [120, np.nan, 150],
    ],
    dtype=object,
)
print("Original Array:", data, sep="\n")
def replace_nan_with_mean(array):
    """Return a float copy of *array* with each NaN replaced by its column mean.

    The mean of a column is taken over its non-NaN entries only. The input is
    never modified; a new array is always returned.

    Args:
        array: 2-D array-like whose entries can all be converted to float.

    Returns:
        A 2-D float ndarray of the same shape as *array*. A column made up
        entirely of NaN has no mean and is left as NaN.

    Raises:
        ValueError: if an entry cannot be converted to float.
    """
    # np.array(..., dtype=float) always copies, so the caller's data is safe.
    numeric_array = np.array(array, dtype=float)
    nan_mask = np.isnan(numeric_array)
    if not nan_mask.any():
        # Nothing to fill; also avoids taking the mean of an empty array.
        return numeric_array
    col_means = np.nanmean(numeric_array, axis=0)
    # col_means has one entry per column and broadcasts across the rows.
    return np.where(nan_mask, col_means, numeric_array)
# Fill NaN gaps with column averages, but only if every entry is numeric.
# The float conversion is the step that raises ValueError on a non-numeric
# entry, so it is the only statement inside the `try`.
try:
    float_data = data.astype("float")
except ValueError:
    print("\nArray contains non-numeric values. NaN replacement skipped.")
else:
    numeric_data = replace_nan_with_mean(float_data)
    print("\nArray After Replacing NaN with Column Averages:")
    print(numeric_data)
def remove_non_numeric_rows(array):
    """Return the rows of *array* whose entries are all real, non-NaN numbers.

    Args:
        array: 2-D ndarray (typically dtype=object) that may contain strings,
            NaN, or other non-numeric entries.

    Returns:
        An ndarray holding only the fully numeric rows, in their original
        order. An input with zero rows yields a result with zero rows.
    """
    # Python numbers plus NumPy scalar integers/floats count as numeric.
    numeric_types = (int, float, np.integer, np.floating)
    numeric_mask = np.array(
        [
            # isinstance is checked first so np.isnan never sees a string.
            all(isinstance(x, numeric_types) and not np.isnan(x) for x in row)
            for row in array
        ],
        # Without an explicit dtype an empty mask would be float64, which
        # cannot be used as an index.
        dtype=bool,
    )
    return array[numeric_mask]
# Drop every row of the sample array that remove_non_numeric_rows rejects,
# then show what is left.
cleaned_data = remove_non_numeric_rows(data)
print("\nArray After Removing Rows with Non-Numeric values:", cleaned_data, sep="\n")
def contains_row(array, row):
    """Return True if some row of the 2-D *array* equals *row* element-wise."""
    # One boolean per row of `array`: True where every column matches `row`.
    row_matches = (array == row).all(axis=1)
    for is_match in row_matches:
        if is_match:
            return True
    return False
# Look for one specific row in the cleaned data. The cleaned array is cast to
# int before the element-wise comparison with the plain list of ints.
row_to_check = [90, 100, 110]
is_present = contains_row(cleaned_data.astype("int"), row_to_check)
answer = 'Yes' if is_present else 'No'
print(f"\n Does the array contain the row {row_to_check}?{answer}")
import numpy as np
import pandas as pd
# Student marks analysis: per-subject and per-student averages, pass rate,
# correlations, a plot of the averages, and per-subject spread.

# This program plots but its own import header only brings in numpy/pandas.
import matplotlib.pyplot as plt

file_path = "student_marks.csv"
scores = pd.read_csv(file_path)

# Display the dataset overview and info
print("Dataset Overview:")
print(scores.head())
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
scores.info()

# All statistics below use the numeric columns only, so an identifier column
# (e.g. a student name) cannot break the reductions.
# NOTE(review): presumably every numeric column is a subject mark — confirm
# against the dataset (a numeric roll-number column would be included too).
marks = scores.select_dtypes(include="number")

# 1. Average score per subject (column-wise mean).
subject_avg_scores = marks.mean(axis=0)
print("\nAverage Score for Each Subject Across All Students (by Subject):")
print(subject_avg_scores)

# 2. Students (row labels) with the highest and lowest row-wise average.
student_avg_scores = marks.mean(axis=1)
highest_avg_student = student_avg_scores.idxmax()
lowest_avg_student = student_avg_scores.idxmin()
print("\nStudent with the highest average score (row label):", highest_avg_student)
print("Student with the lowest average score (row label):", lowest_avg_student)

# 3. Pass rate per subject, as a percentage of students.
# NOTE(review): `pass_rate` was printed but never computed in the original.
# The pass mark of 40 is an assumption — confirm the real threshold.
PASS_MARK = 40
pass_rate = (marks >= PASS_MARK).mean(axis=0) * 100
print(pass_rate)

# 4. Pairwise correlation between subjects.
correlation_matrix = marks.corr()
print(correlation_matrix)

# 5. Overall average score for each semester (assuming each subject is a semester)
semester_avg_scores = marks.mean(axis=0)
print(semester_avg_scores)

# 6. Plotting the average scores
plt.figure(figsize=(8, 5))
# The original opened the figure and labelled the axes but never drew data.
plt.plot(semester_avg_scores.index, semester_avg_scores.values, marker="o")
plt.xlabel("Semester")
plt.ylabel("Average Score")
plt.grid(True)
plt.show()

# 7. Spread of the scores within each subject.
subject_std_dev = marks.std(axis=0)
print(subject_std_dev)
# Program 10
import pandas as pd
import numpy as np
# Retail store analysis: revenue totals, best-selling product, monthly revenue
# trend, correlations, and a price-versus-quantity scatter plot.

# This program plots but its own import header only brings in numpy/pandas.
import matplotlib.pyplot as plt

file_path = "retail_store.csv"
store = pd.read_csv(file_path)
df = pd.DataFrame(store)

# NOTE(review): the price column name is not visible anywhere in the original;
# "Unit_Price" is an assumption modelled on "Product_ID" — confirm it.
PRICE_COLUMN = "Unit_Price"

# Calculate revenue
# The original referenced df["Revenue"] without ever computing it. It is
# derived here only when the file does not already provide it.
if "Revenue" not in df.columns and PRICE_COLUMN in df.columns:
    df["Revenue"] = df["Quantity"] * df[PRICE_COLUMN]

print("Sample Dataset:")
print(df.head())

# Total revenue
total_revenue = df["Revenue"].sum()
print("Total Revenue:", total_revenue)

# Revenue per product and the product that earned the most.
product_revenue = df.groupby("Product_ID")["Revenue"].sum()
highest_revenue_product = product_revenue.idxmax()
print("Product with the Highest Revenue:", highest_revenue_product)

average_quantity = df["Quantity"].mean()
print("Average Quantity Sold:", average_quantity)

# Monthly revenue: bucket each sale date into its calendar month.
df["Month"] = pd.to_datetime(df["Date"]).dt.to_period("M")
monthly_sales = df.groupby("Month")["Revenue"].sum()

plt.figure(figsize=(10, 6))
# Month periods are converted to text so they can be used as x positions.
plt.plot(monthly_sales.index.astype(str), monthly_sales.values, marker="o")
plt.xlabel("Month")
plt.ylabel("Total Revenue")
plt.grid()
plt.show()

# Correlation matrix
# `correlation` was printed but never computed in the original; it is taken
# over the numeric columns.
correlation = df.select_dtypes(include="number").corr()
print("Correlation Matrix:")
print(correlation)

# Scatter plot
plt.figure(figsize=(8, 6))
if PRICE_COLUMN in df.columns:
    plt.scatter(df[PRICE_COLUMN], df["Quantity"], alpha=0.6)
else:
    print(f"Column '{PRICE_COLUMN}' not found; scatter plot left empty.")
plt.xlabel("Unit Price")
plt.ylabel("Quantity Sold")
plt.grid()
plt.show()