DST Python Code With Explanation
DST Python Code With Explanation
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import seaborn as sns
from sklearn.cluster import KMeans
# Load the nutrition workbook into a DataFrame.
# NOTE(review): the path is machine-specific; change it before running elsewhere.
raw_data = pd.read_excel("C:/Users/Bobby/Documents/Food and nutrition.xlsx")

# Quick look at the data: the first rows, then summary statistics.
print(raw_data.head())
print(raw_data.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly. Printing the uncalled attribute only shows the
# bound-method repr, not the column/dtype report.
raw_data.info()

# X is the list of column names; y is the first 100 rows as (index, Series)
# pairs, exactly as DataFrame.iterrows() yields them.
X = list(raw_data.columns)
y = tuple(raw_data.head(100).iterrows())

# Show the column names and rows 1..9 of y (the slice skips the first pair).
print(X)
print(y[1:10])

# NOTE(review): zip pairs the i-th column name with the i-th (index, row)
# tuple, so `data` maps column names to whole rows rather than to column
# values. Kept as written; confirm whether a per-column mapping was intended.
data = dict(zip(X, y))
print(data.values())
print(data.keys())
# Build one dictionary per row of `y`.
# Each element of `y` is an (index, Series) pair from DataFrame.iterrows(), so
# the pair is unpacked and the Series values are matched to the column names in
# `X`. A Series has no str.split, and no `columns` name is defined in this script.
food_data = []
for _, row in y:
    food_dict = dict(zip(X, row))

    # Wrap values in a single-element list where applicable: every int, and
    # every string except the "name" and "serving_size" fields.
    # Only existing keys are reassigned, so mutating while iterating is safe.
    for key, value in food_dict.items():
        is_wrappable_text = isinstance(value, str) and key not in ("name", "serving_size")
        if is_wrappable_text or isinstance(value, int):
            food_dict[key] = [value]

    food_data.append(food_dict)
# `food_data` now holds one dictionary for every row in `y`.
# Convert every row of the DataFrame into a {column name: value} dictionary.
# The loop is bounded by the number of rows. Bounding it by len(X), the number
# of columns, would stop early on a long table and raise IndexError on a table
# with fewer rows than columns.
list_of_dicts = [raw_data.iloc[i].to_dict() for i in range(len(raw_data))]
# Boolean mask with the same shape as raw_data: True where a cell is missing.
# Both names refer to the same mask object.
missing_values = raw_data.isnull()
ms = missing_values
# Disabled alternative that would fill gaps instead of only flagging them:
# raw_data.fillna(raw_data.sum(), inplace=True)
# Column names selected for analysis, in their original order.
# NOTE(review): spellings such as "lucopene", "irom" and "zink" are kept
# verbatim; presumably they mirror the workbook's column headers. Confirm
# against raw_data.columns before "correcting" any of them.
subset_columns = [
    "name", "serving_size", "calories", "total_fat",
    "saturated_fat", "cholesterol", "sodium", "choline",
    "folate", "folic_acid", "niacin", "pantothenic_acid",
    "riboflavin", "thiamin", "vitamin_a", "vitamin_a_rae",
    "carotene_alpha", "carotene_beta", "cryptoxanthin_beta", "lutein_zeaxanthin",
    "lucopene", "vitamin_b12", "vitamin_b6", "vitamin_c",
    "vitamin_d", "vitamin_e", "tocopherol_alpha", "vitamin_k",
    "calcium", "copper", "irom", "magnesium",
    "manganese", "phosphorous", "potassium", "selenium",
    "zink", "protein", "alanine", "arginine",
    "aspartic_acid", "cystine", "glutamic_acid", "glycine",
    "histidine", "hydroxyproline", "isoleucine", "leucine",
    "lysine", "methionine", "phenylalanine", "proline",
    "serine", "threonine", "tryptophan", "tyrosine",
    "valine", "carbohydrate", "fiber", "sugars",
    "fructose", "galactose", "glucose", "lactose",
    "maltose", "sucrose", "fat", "saturated_fatty_acids",
    "monounsaturated_fatty_acids", "polyunsaturated_fatty_acids",
    "fatty_acids_total_trans", "alcohol",
    "ash", "caffeine", "theobromine", "water",
]
# Bivariate view: serving size against calories, one green marker per row.
# No `cmap` and no colorbar: `c='g'` is a single fixed colour, so there is no
# data-to-colour mapping. Matplotlib ignores `cmap` in that case, and a
# colorbar would show a scale unrelated to the plotted values.
plt.scatter(raw_data['serving_size'], raw_data['calories'], c='g', linewidths=0.56)
plt.xlabel('Serving Size')
plt.ylabel('Calories')
plt.title('Scatter Plot for Bivariate Analysis')
plt.show()
# Work on a copy of the table without the 'name' column.
raw_data1 = raw_data.drop(columns=['name'])
# Keep only rows with no missing cell. Despite the variable's name, rows are
# dropped here; nothing is filled in.
filled_values = raw_data1.dropna()
# Pairwise correlation between the columns of raw_data1.
correlation_matrix = raw_data1.corr()
# Multivariate analysis.
# `numerical_data` is the table without the 'name' column. It is not used by
# the PCA below, which runs on `filled_values` (rows without missing cells).
numerical_data = raw_data.drop(columns=['name'])

# Reduce the complete rows to three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(filled_values)

# Plot the first two components against each other.
# `c='gray'` is one fixed colour, so `cmap` would be ignored and a colorbar
# would show a scale unrelated to the data; both are omitted. The legend call
# is omitted as well because no artist carries a label for it to list.
plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.6, linewidths=(3, 4), c='gray')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Result: 2D Scatterplot')
plt.grid(True)
plt.show()
# Check if the columns exist in the DataFrame
# missing_columns = [col for col in subset_columns if col not in raw_data.columns]
# if not missing_columns:
# Your pair plot code here
# sns.pairplot(raw_data[subset_columns])
# plt.show()
# else:
# print("Columns not found in DataFrame:", missing_columns)
# print("Available columns:", raw_data.columns)
# print("Column data types:", raw_data.dtypes)
print(filled_values)

# Fit K-Means with 50 clusters on the rows that have no missing cell.
# NOTE: despite its name, `n_clusters` is the KMeans estimator itself, and
# `no_clusters` is the output of fit_transform: each sample's distance to
# every cluster centre, one column per cluster.
n_clusters = KMeans(n_clusters=50, n_init="auto", algorithm='lloyd', max_iter=1000)
no_clusters = n_clusters.fit_transform(filled_values)

# Scatter plot of the K-Means result. The axes are the distances to the first
# two cluster centres. Each point is coloured by its assigned cluster label, so
# the 'inferno' colormap and the colorbar describe real data. A single fixed
# colour would make Matplotlib ignore `cmap` and leave the colorbar meaningless.
plt.figure(figsize=(5, 8))
plt.subplot(1, 1, 1)
plt.scatter(no_clusters[:, 0], no_clusters[:, 1], linewidths=(2, 3),
            c=n_clusters.labels_, cmap='inferno')
plt.colorbar()
plt.grid(True)
plt.show()
# Bivariate analysis: 2D histogram of serving size against calories.
# Only arguments that take effect are passed:
#   - `binwidth` overrides `bins`, so `bins="auto"` is dropped;
#   - `element` applies to univariate histograms only;
#   - `palette` is ignored (with a warning) when no `hue` is assigned.
# NOTE(review): a single (100, 200) `binrange` applies to both axes. Confirm
# that serving size and calories are both meant to be limited to that interval.
sns.histplot(filled_values, x='serving_size', y='calories', stat="count",
             binwidth=0.56, binrange=(100, 200))
plt.show()
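1) pandas:
Description: pandas is a data analysis library built around the DataFrame,
a labelled table of rows and columns. In this script it reads the Excel
workbook and provides the summary, missing-value, and correlation steps.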
2)matplotlib.pyplot:
Description: Matplotlib is a data visualization library for creating static,
animated, or interactive plots in Python. pyplot is a collection of functions
that provide a simple interface for creating various types of plots.
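3) sklearn.decomposition.PCA:
Description: This is part of scikit-learn and stands for Principal Component
Analysis. It reduces the number of dimensions in the data by projecting it
onto the directions of greatest variance.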
4)sklearn.svm.SVC:
Description: This is also part of scikit-learn and stands for Support Vector
Classification. It is used for classification tasks using Support Vector
Machines (SVM).
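5) seaborn:
Description: Seaborn is a statistical data visualization library built on
top of Matplotlib. In this script it draws the bivariate histogram.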
6)sklearn.cluster.KMeans:
Description: Another part of scikit-learn, KMeans is an unsupervised
machine learning algorithm used for clustering data into groups based on
similarity.
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import seaborn as sns
from sklearn.cluster import KMeans
c) Extract and print the first few rows and the statistical summary of the data:
# Preview the table, then print its summary statistics.
print(raw_data.head())
print(raw_data.describe())
# Boolean mask of missing cells; both names refer to the same mask object.
missing_values = raw_data.isnull()
ms = missing_values
# Drop the 'name' column, keep the rows with no missing cell, and compute the
# pairwise correlation of the remaining columns.
raw_data1 = raw_data.drop(columns=['name'])
filled_values = raw_data1.dropna()
correlation_matrix = raw_data1.corr()
# Reduce the complete rows to three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(filled_values)

# Plot the first two components against each other.
# `c='gray'` is one fixed colour, so `cmap` would be ignored and a colorbar
# would show a scale unrelated to the data; both are omitted. The legend call
# is omitted as well because no artist carries a label for it to list.
plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.6, linewidths=(3, 4), c='gray')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Result: 2D Scatterplot')
plt.grid(True)
plt.show()
# Scatter plot of the K-Means result. `no_clusters` is the fit_transform
# output, so the axes are the distances to the first two cluster centres.
# Each point is coloured by the cluster label of the fitted estimator
# `n_clusters`, so the 'inferno' colormap and the colorbar describe real data.
# A single fixed colour would make Matplotlib ignore `cmap`.
plt.figure(figsize=(5, 8))
plt.subplot(1, 1, 1)
plt.scatter(no_clusters[:, 0], no_clusters[:, 1], linewidths=(2, 3),
            c=n_clusters.labels_, cmap='inferno')
plt.colorbar()
plt.grid(True)
plt.show()