0% found this document useful (0 votes)
16 views

DST Python Code With Explanation

Python Code for simple TikTok Game

Uploaded by

Vedant Gade
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
16 views

DST Python Code With Explanation

Python Code for simple TikTok Game

Uploaded by

Vedant Gade
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

# Import necessary libraries

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import seaborn as sns
from sklearn.cluster import KMeans
# Read the Excel workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific — confirm it exists
# on the machine that runs this script.
raw_data = pd.read_excel("C:/Users/Bobby/Documents/Food and nutrition.xlsx")

# Quick look at the data: first rows, then summary statistics.
print(raw_data.head())
print(raw_data.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly. The original `print(raw_data.info)` never called the
# method and printed the bound-method object instead of the report.
raw_data.info()

# Column names (taking head() first does not change the columns, so it is
# omitted) and the first 100 rows as (index, Series) pairs.
X = list(raw_data.columns)
y = tuple(raw_data.head(100).iterrows())

# Print column names and a subset of rows
print(X)
print(y[1:10])

# Convert the first 100 rows into a dictionary: column name -> list of values.
# The original `dict(zip(X, y))` paired the i-th column name with the i-th
# row tuple, which relates two unrelated things and silently truncates to the
# shorter of the two sequences.
data = raw_data.head(100).to_dict(orient="list")
print(data.values())
print(data.keys())
# Build one dictionary per row from the (index, Series) pairs stored in `y`.
food_data = []
for _, row in y:
    # Each element of `y` comes from DataFrame.iterrows(), so it is an
    # (index, Series) tuple. The Series already carries the column labels,
    # so it converts straight to a dict; there is no tab-separated text to
    # split and no separate `columns` list is needed.
    food_dict = row.to_dict()

    # Wrap values in a single-element list where applicable:
    #   - strings, except for the "name" and "serving_size" columns
    #   - integers, for every column
    for key, value in food_dict.items():
        is_wrappable_text = isinstance(value, str) and key not in ("name", "serving_size")
        if is_wrappable_text or isinstance(value, int):
            food_dict[key] = [value]

    # Append the dictionary to the list
    food_data.append(food_dict)
# Now, 'food_data' is a list containing one dictionary per row of `y`
# One dictionary per DataFrame row, keyed by column name.
# The original looped over range(len(X)) — the number of *columns* — and
# indexed rows with it, so it covered only as many rows as there are columns
# and raised IndexError whenever the sheet had fewer rows than columns.
list_of_dicts = raw_data.to_dict(orient="records")

# Check for missing values: boolean DataFrame, True where a cell is missing.
# Both names are bound to the same object.
ms = missing_values = raw_data.isnull()
# Disabled in the original; kept for reference.
#raw_data.fillna(raw_data.sum(), inplace=True)
# Column names selected for analysis (76 entries, order preserved).
# NOTE(review): spellings such as "lucopene", "irom" and "zink" are kept
# exactly as written — presumably they match the spreadsheet headers; verify
# against the workbook before "correcting" them.
subset_columns = [
    "name", "serving_size", "calories", "total_fat",
    "saturated_fat", "cholesterol", "sodium", "choline",
    "folate", "folic_acid", "niacin", "pantothenic_acid",
    "riboflavin", "thiamin", "vitamin_a", "vitamin_a_rae",
    "carotene_alpha", "carotene_beta", "cryptoxanthin_beta", "lutein_zeaxanthin",
    "lucopene", "vitamin_b12", "vitamin_b6", "vitamin_c",
    "vitamin_d", "vitamin_e", "tocopherol_alpha", "vitamin_k",
    "calcium", "copper", "irom", "magnesium",
    "manganese", "phosphorous", "potassium", "selenium",
    "zink", "protein", "alanine", "arginine",
    "aspartic_acid", "cystine", "glutamic_acid", "glycine",
    "histidine", "hydroxyproline", "isoleucine", "leucine",
    "lysine", "methionine", "phenylalanine", "proline",
    "serine", "threonine", "tryptophan", "tyrosine",
    "valine", "carbohydrate", "fiber", "sugars",
    "fructose", "galactose", "glucose", "lactose",
    "maltose", "sucrose", "fat", "saturated_fatty_acids",
    "monounsaturated_fatty_acids", "polyunsaturated_fatty_acids",
    "fatty_acids_total_trans", "alcohol",
    "ash", "caffeine", "theobromine", "water",
]
# Bivariate scatter plot: serving size against calories.
# The points are coloured by their calorie value so that `cmap` and the colour
# bar actually have data to map. The original passed the single colour c='g',
# in which case matplotlib ignores `cmap` and the colour bar represents nothing.
# NOTE(review): assumes the 'calories' column is numeric — confirm against the
# workbook.
calorie_scatter = plt.scatter(
    raw_data['serving_size'],
    raw_data['calories'],
    c=raw_data['calories'],
    cmap='inferno',
    linewidths=0.56,
)
plt.colorbar(calorie_scatter, label='Calories')
plt.xlabel('Serving Size')
plt.ylabel('Calories')
plt.title('Scatter Plot for Bivariate Analysis')
plt.show()
# Calculate the correlation matrix.
# The text identifier column is dropped first. numeric_only=True keeps corr()
# from raising on any remaining non-numeric column (pandas >= 2.0 no longer
# drops such columns silently).
raw_data1 = raw_data.drop(columns='name')
correlation_matrix = raw_data1.corr(numeric_only=True)

# Rows without any missing value. Despite the name, nothing is filled in:
# incomplete rows are dropped.
filled_values = raw_data1.dropna()

# Multivariate Analysis
# Same frame as raw_data1; not used by the statements below, kept because the
# name is defined at module level.
numerical_data = raw_data.drop(columns=['name'])

# Perform PCA, keeping the first three principal components.
# PCA only accepts numeric input, so non-numeric columns are excluded here.
# `filled_values` itself is left untouched because later plots read its
# columns by name.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(filled_values.select_dtypes(include='number'))
# Scatter plot of the PCA result: component 1 against component 2.
# The third component drives the grey colour scale, which gives `cmap` and
# the colour bar real data. The original passed the literal colour c='gray',
# so `cmap` was ignored and the colour bar was empty.
plt.figure(figsize=(8, 6))
pca_scatter = plt.scatter(
    pca_result[:, 0],
    pca_result[:, 1],
    c=pca_result[:, 2],
    cmap='gray',
    alpha=0.6,
    linewidths=(3, 4),
    label='Samples',  # gives plt.legend() an artist to list
)
plt.colorbar(pca_scatter, label='Principal Component 3')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.title('PCA Result: 2D Scatterplot')
plt.grid(True)
plt.show()
# Optional pair plot over subset_columns, currently disabled. It checks that
# every requested column exists in the DataFrame before plotting, and
# otherwise reports what is missing:
#
# missing_columns = [col for col in subset_columns if col not in raw_data.columns]
# if not missing_columns:
#     # Your pair plot code here
#     sns.pairplot(raw_data[subset_columns])
#     plt.show()
# else:
#     print("Columns not found in DataFrame:", missing_columns)
#     print("Available columns:", raw_data.columns)
#     print("Column data types:", raw_data.dtypes)

# Print the rows that remain after dropping rows with missing values.
print(filled_values)
# Perform K-Means clustering.
# Naming note: `n_clusters` is the KMeans estimator itself, and `no_clusters`
# is what fit_transform returns — one row per sample, one column per cluster
# centre, holding the distance from the sample to that centre.
# NOTE(review): 50 clusters requires at least 50 complete rows — confirm the
# data set is large enough.
n_clusters = KMeans(n_clusters=50, n_init="auto", algorithm='lloyd', max_iter=1000)
# KMeans needs numeric input, so non-numeric columns are excluded.
no_clusters = n_clusters.fit_transform(filled_values.select_dtypes(include='number'))

# Scatter plot of the samples in cluster-distance space (centres 0 and 1),
# coloured by the cluster each sample was assigned to. The original passed
# c='b', so `cmap` was ignored and the colour bar mapped no data.
plt.figure(figsize=(5, 8))
plt.subplot(1, 1, 1)
cluster_scatter = plt.scatter(
    no_clusters[:, 0],
    no_clusters[:, 1],
    c=n_clusters.labels_,
    cmap='inferno',
    linewidths=(2, 3),
)
plt.colorbar(cluster_scatter, label='Cluster label')
plt.xlabel('Distance to cluster centre 0')
plt.ylabel('Distance to cluster centre 1')
plt.grid(True)
plt.show()
# Bivariate analysis: 2-D histogram of serving size against calories.
# Three arguments from the original call are removed because they had no
# effect here:
#   - bins="auto"          : overridden by `binwidth`
#   - element='step'       : only relevant for univariate histograms
#   - palette='colorblind' : only used together with a `hue` variable
# `binrange` is applied to both axes, so only values between 100 and 200 are
# binned.
sns.histplot(
    filled_values,
    x='serving_size',
    y='calories',
    stat="count",
    binwidth=0.56,
    binrange=(100, 200),
)
plt.show()
1) pandas:

Description: pandas is a popular Python library for data manipulation and
analysis. It provides data structures such as DataFrames and Series for working
with structured data.

2)matplotlib.pyplot:
Description: Matplotlib is a data visualization library for creating static,
animated, or interactive plots in Python. pyplot is a collection of functions
that provide a simple interface for creating various types of plots.

3) sklearn.decomposition.PCA:

Description: This is part of the scikit-learn library (sklearn) and provides
Principal Component Analysis (PCA) for dimensionality reduction and
feature extraction.

4) sklearn.svm.SVC:

Description: This is also part of scikit-learn and stands for Support Vector
Classification. It is used for classification tasks using Support Vector
Machines (SVM). Note that the script shown here imports SVC but never uses it.

5) seaborn:

Description: Seaborn is a data visualization library based on Matplotlib. It
provides a high-level interface for creating informative and attractive
statistical graphics.

6)sklearn.cluster.KMeans:
Description: Another part of scikit-learn, KMeans is an unsupervised
machine learning algorithm used for clustering data into groups based on
similarity.

a)Import necessary libraries:

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import seaborn as sns
from sklearn.cluster import KMeans

b)Read the data from an Excel file using Pandas:

raw_data = pd.read_excel("C:/Users/Bobby/Documents/Food and nutrition.xlsx")

c)Extract and print the first few rows and statistical summary of the data:

print(raw_data.head())
print(raw_data.describe())

d)Extract column names and a subset of rows:


X = list(raw_data.head(76).columns)
y = tuple(raw_data.head(100).iterrows())
e)Convert data into a dictionary and print values and keys:

data = dict(zip(X, y))


print(data.values())
print(data.keys())

f)Create an empty list to store dictionaries and convert rows to dictionaries:


food_data = []
for row in y:
row_data = row.split("\t")
food_dict = dict(zip(columns, row_data))
# Convert values to lists where applicable and append the dictionary
food_data.append(food_dict)

g)Create a list of dictionaries and check for missing values:


list_of_dicts = []
for x in range(0, len(X)):
row_dict = raw_data.iloc[x].to_dict()
list_of_dicts.append(row_dict)

ms = missing_values = raw_data.isnull()

h)Define a subset of columns for analysis:

subset_columns = [list of column names]


i)Create a scatter plot between two numeric columns:

plt.scatter(raw_data['serving_size'], raw_data['calories'], cmap='inferno',


linewidths=0.56, c='g')
plt.colorbar()
plt.xlabel('Serving Size')
plt.ylabel('Calories')
plt.title('Scatter Plot for Bivariate Analysis')
plt.show()

j)Calculate the correlation matrix and perform PCA:

raw_data1 = raw_data.drop(columns='name')
correlation_matrix = raw_data1.corr()
filled_values = raw_data1.dropna()

pca = PCA(n_components=3)
pca_result = pca.fit_transform(filled_values)

k)Create a scatter plot of PCA result:

plt.figure(figsize=(8, 6))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.6, linewidths=(3, 4),
cmap='gray', c='gray')
plt.colorbar()
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.title('PCA Result: 2D Scatterplot')
plt.grid(True)
plt.show()

l)Perform K-Means clustering and create a scatter plot of clusters:

n_clusters = KMeans(n_clusters=50, n_init="auto", algorithm='lloyd',


max_iter=1000)
no_clusters = n_clusters.fit_transform(filled_values)

plt.figure(figsize=(5, 8))
plt.subplot(1, 1, 1)
plt.scatter(no_clusters[:, 0], no_clusters[:, 1], linewidths=(2, 3), c='b',
cmap='inferno')
plt.colorbar()
plt.grid(True)
plt.show()

m)Perform bivariate analysis using a histogram:

sns.histplot(filled_values, x='serving_size', y='calories', stat="count",


binwidth=0.56, bins="auto",
element='step', palette='colorblind', binrange=(100, 200))
plt.show()

You might also like