Assignment 1: Python Fundamentals and Data Structures
1. Custom Data Structures
Singly Linked List:
class Node:
    def __init__(self, data):
        self.data = data
        self.next = None

class LinkedList:
    def __init__(self):
        self.head = None

    def append(self, data):
        new_node = Node(data)
        if not self.head:
            self.head = new_node
            return
        current = self.head
        while current.next:
            current = current.next
        current.next = new_node

    def display(self):
        current = self.head
        while current:
            print(current.data, end=" -> ")
            current = current.next
        print("None")
Stack:
class Stack:
    def __init__(self):
        self.stack = []

    def push(self, item):
        self.stack.append(item)

    def pop(self):
        if not self.stack:
            raise IndexError("Pop from empty stack")
        return self.stack.pop()

    def peek(self):
        return self.stack[-1] if self.stack else None
Queue:
class Queue:
    def __init__(self):
        self.queue = []

    def enqueue(self, item):
        self.queue.append(item)

    def dequeue(self):
        if not self.queue:
            raise IndexError("Dequeue from empty queue")
        return self.queue.pop(0)  # O(n): every remaining element shifts left; see the deque sketch below

    def peek(self):
        return self.queue[0] if self.queue else None
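Because list.pop(0) is O(n), a production queue is usually built on collections.deque from the standard library, which pops from the left in O(1). A minimal sketch (DequeQueue is an illustrative name, not part of the assignment brief):

from collections import deque

class DequeQueue:
    """Queue with O(1) enqueue and dequeue, backed by collections.deque."""

    def __init__(self):
        self.queue = deque()

    def enqueue(self, item):
        self.queue.append(item)

    def dequeue(self):
        if not self.queue:
            raise IndexError("Dequeue from empty queue")
        return self.queue.popleft()  # O(1), unlike list.pop(0)

    def peek(self):
        return self.queue[0] if self.queue else None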
2. Statistical Calculations Without Libraries
def mean(data):
    return sum(data) / len(data)

def median(data):
    data = sorted(data)
    n = len(data)
    mid = n // 2
    return data[mid] if n % 2 != 0 else (data[mid - 1] + data[mid]) / 2

def mode(data):
    frequency = {}
    for num in data:
        frequency[num] = frequency.get(num, 0) + 1
    max_freq = max(frequency.values())
    return [k for k, v in frequency.items() if v == max_freq]

def standard_deviation(data):
    mu = mean(data)
    variance = sum((x - mu) ** 2 for x in data) / len(data)  # population variance
    return variance ** 0.5
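A quick sanity check of these functions on a small dataset with known results:

data = [2, 4, 4, 4, 5, 5, 7, 9]
print(mean(data))                # 5.0
print(median(data))              # 4.5
print(mode(data))                # [4]
print(standard_deviation(data))  # 2.0 (population standard deviation)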
3. File I/O Operations
import json
import csv

# Text file
with open("example.txt", "w") as f:
    f.write("Hello, World!")

# JSON file
data = {"name": "Alice", "age": 30}
with open("example.json", "w") as f:
    json.dump(data, f)

# CSV file
rows = [["Name", "Age"], ["Bob", 25], ["Charlie", 30]]
with open("example.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(rows)
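For completeness, the matching read operations, using the same file names as above:

with open("example.txt") as f:
    text = f.read()

with open("example.json") as f:
    loaded = json.load(f)  # back to a dict

with open("example.csv", newline="") as f:
    reader = csv.reader(f)
    rows_read = list(reader)  # every cell comes back as a string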
4. Error Handling and Exception Management
def divide(a, b):
    try:
        result = a / b
    except ZeroDivisionError:
        return "Cannot divide by zero!"
    except TypeError:
        return "Invalid input type!"
    else:
        return result
    finally:
        print("Division attempted.")
5. Code Documentation and PEP 8 Compliance
def mean(data):
    """
    Calculate the mean (average) of a list of numbers.

    Parameters:
        data (list): List of numerical values

    Returns:
        float: Mean value
    """
    return sum(data) / len(data)
Assignment 2: NumPy and Mathematical Computing
1. Array Manipulation and Broadcasting
import numpy as np

a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
elementwise_sum = a + b  # element-wise addition (shapes match, so no broadcasting is needed)

matrix = np.array([[1, 2], [3, 4]])
matrix_transpose = matrix.T
reshaped = np.reshape(matrix, (1, 4))
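Broadcasting proper is what happens when NumPy stretches a smaller array across a larger one. A minimal sketch using the matrix defined above:

row = np.array([10, 20])
scaled = matrix + row   # (2, 2) + (2,): the row is broadcast across each matrix row
# [[11, 22],
#  [13, 24]]
shifted = matrix + 100  # a scalar is broadcast to every element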
2. Linear Algebra Operations
from numpy.linalg import eig, svd
A = np.array([[1, 2], [3, 4]])
product = np.dot(A, A)
eigenvalues, eigenvectors = eig(A)
U, S, Vt = svd(A)
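A useful consistency check on both decompositions, using np.allclose to compare within floating-point tolerance:

# Each eigenvector column v_i satisfies A @ v_i = lambda_i * v_i
print(np.allclose(A @ eigenvectors, eigenvectors * eigenvalues))  # True
# The SVD factors reconstruct A
print(np.allclose(U @ np.diag(S) @ Vt, A))  # True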
3. Statistical Operations and Random Number Generation
random_array = np.random.rand(5)
mean_val = np.mean(random_array)
std_val = np.std(random_array)
median_val = np.median(random_array)
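For reproducible results, NumPy's recommended approach is a seeded Generator from np.random.default_rng rather than the legacy np.random functions:

rng = np.random.default_rng(seed=42)
sample = rng.normal(loc=0.0, scale=1.0, size=5)  # same values on every run
print(sample.mean(), sample.std())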
4. Performance Comparison: Python vs NumPy
import time

# Pure Python
start = time.perf_counter()  # perf_counter is the appropriate clock for benchmarking
py_result = [i * 2 for i in range(1000000)]
end = time.perf_counter()
python_time = end - start

# NumPy
start = time.perf_counter()
np_result = np.arange(1000000) * 2
end = time.perf_counter()
numpy_time = end - start

print(f"Python time: {python_time:.4f}s, NumPy time: {numpy_time:.4f}s")
5. Vectorization Techniques
# Using vectorization for fast computation
arr = np.arange(10)
squared = arr ** 2
# Instead of using a loop
# squared = [x**2 for x in arr]
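Boolean masking and np.where are two further vectorization idioms that replace explicit Python loops; a short sketch on the same array:

evens = arr[arr % 2 == 0]            # boolean mask selects elements without a loop
clipped = np.where(arr > 5, 5, arr)  # element-wise conditional, fully vectorized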
Assignment 3: Data Manipulation with Pandas
1. Data Cleaning and Preprocessing
import pandas as pd
import numpy as np

df = pd.read_csv("messy_data.csv")
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
df.drop_duplicates(inplace=True)
df.fillna(df.mean(numeric_only=True), inplace=True)
2. Merging, Joining, and Concatenating
df1 = pd.DataFrame({'id': [1, 2], 'name': ['Alice', 'Bob']})
df2 = pd.DataFrame({'id': [1, 2], 'age': [25, 30]})
merged = pd.merge(df1, df2, on='id')
concatenated = pd.concat([df1, df2], axis=1)  # note: keeps both 'id' columns
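pd.merge defaults to an inner join; the how parameter selects the other join types:

left_join = pd.merge(df1, df2, on='id', how='left')    # keep all rows of df1
outer_join = pd.merge(df1, df2, on='id', how='outer')  # keep all rows of both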
3. GroupBy Operations and Pivot Tables
grouped = df.groupby('category')['value'].mean()
pivot = df.pivot_table(values='value', index='category', columns='year', aggfunc='sum')
4. Time Series Analysis and Date/Time Manipulation
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
monthly_avg = df.resample('M').mean()  # 'M' = month-end frequency; pandas >= 2.2 prefers 'ME'
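Rolling windows are a common companion to resampling for smoothing a series. A sketch, assuming df has a numeric 'value' column as in the earlier sections:

df['rolling_7'] = df['value'].rolling(window=7).mean()  # 7-observation moving average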
5. Data Validation and Quality Assessment
# Check for missing values
missing = df.isnull().sum()
# Check for invalid data types or ranges
invalid_ages = df[df['age'] < 0]
# Summary statistics
summary = df.describe()
Assignment 4: Data Visualization
1. Visualizations using Matplotlib, Seaborn, and Plotly
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Matplotlib
plt.plot([1, 2, 3, 4], [10, 20, 25, 30])
plt.title('Line Plot')
plt.show()
# Seaborn
sns.histplot(data=df, x='age', bins=10, kde=True)
plt.show()
# Plotly
fig = px.bar(df, x='category', y='value')
fig.show()
2. Interactive Dashboards with Plotly Dash or Streamlit
# Streamlit app example
import streamlit as st
st.title('Interactive Dashboard')
age = st.slider('Select Age', 0, 100, 25)
st.write(f'You selected age: {age}')
3. Statistical Plots
# Correlation Matrix (numeric columns only)
correlation = df.corr(numeric_only=True)
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.show()
# Distribution Plot
sns.displot(df['value'], kde=True)
plt.show()
# Box Plot
sns.boxplot(x='category', y='value', data=df)
plt.show()
4. Geospatial Data Visualization
import geopandas as gpd

# Note: the bundled naturalearth_lowres dataset was removed in GeoPandas 1.0;
# with older versions this loads a world map, otherwise read a shapefile directly.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.plot()
plt.show()
5. Custom Visualization Functions and Styling
def custom_plot(df, x, y, title):
    plt.figure(figsize=(10, 6))
    plt.plot(df[x], df[y], color='green', marker='o')
    plt.title(title)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.grid(True)
    plt.show()

custom_plot(df, 'date', 'value', 'Value over Time')
Assignments 5 & 6: MSc in Python
Programming for Data Science
Assignment 5: Web Scraping and API Integration (Week 10)
**Weight: 10%**
1. Web Scraping with BeautifulSoup
Scraping quotes using BeautifulSoup:
import requests
from bs4 import BeautifulSoup

url = "http://quotes.toscrape.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for quote in soup.find_all("div", class_="quote"):
    text = quote.find("span", class_="text").text
    author = quote.find("small", class_="author").text
    print(f"{text} - {author}")
2. Scrapy Spider Example
Scrapy spider for the same site:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
3. REST API Consumption
Fetching posts from JSONPlaceholder:
import requests

url = "https://jsonplaceholder.typicode.com/posts"
response = requests.get(url)
posts = response.json()

for post in posts[:5]:
    print(f"Title: {post['title']}\nBody: {post['body']}\n")
4. XML and HTML Handling
import xml.etree.ElementTree as ET

xml_data = """<root><item><name>Item 1</name></item></root>"""
root = ET.fromstring(xml_data)

for item in root.findall('item'):
    print(item.find('name').text)
5. Rate Limiting and Ethics
A fixed delay between requests avoids overloading the server:
import time

for i in range(5):
    print(f"Fetching page {i}")
    time.sleep(2)  # wait 2 seconds between requests
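Ethical scraping also means honouring robots.txt; the standard library's urllib.robotparser can check whether a URL may be fetched. A sketch using the quotes site from above:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("http://quotes.toscrape.com/robots.txt")
rp.read()
print(rp.can_fetch("*", "http://quotes.toscrape.com/page/1/"))  # True if allowed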
6. Data Pipeline
import schedule
import time

def job():
    print("Scraping data...")

schedule.every().day.at("10:00").do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
Assignment 6: Machine Learning Implementation (Weeks 11-12)
Weight: 20%
1. Linear Regression from Scratch
import numpy as np

X = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])

m = b = 0.0   # slope and intercept
L = 0.01      # learning rate
epochs = 1000

for _ in range(epochs):
    y_pred = m * X + b
    error = y - y_pred
    m -= L * (-2 * (X * error).mean())  # gradient of MSE with respect to m
    b -= L * (-2 * error.mean())        # gradient of MSE with respect to b

print(f"y = {m:.2f}x + {b:.2f}")
2. K-Means Clustering
from sklearn.cluster import KMeans
import numpy as np
data = np.array([[1, 2], [1, 4], [1, 0],
[10, 2], [10, 4], [10, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
print(kmeans.labels_)
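The fitted model exposes the cluster centres and can assign labels to new points:

print(kmeans.cluster_centers_)            # centres near [1, 2] and [10, 2] (order may vary)
print(kmeans.predict([[0, 0], [12, 3]]))  # nearest-centre label for each new point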
3. Decision Tree using Scikit-learn
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
clf = DecisionTreeClassifier()
clf.fit(iris.data, iris.target)
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))
4. Cross-Validation and Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
params = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = SVC()
clf = GridSearchCV(svc, params)
clf.fit(iris.data, iris.target)
print(clf.best_params_)
5. Model Evaluation Metrics
from sklearn.metrics import classification_report

y_pred = clf.predict(iris.data)  # note: predictions on the training data itself
print(classification_report(iris.target, y_pred))
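Evaluating on the training data overstates performance; a held-out test set gives an honest estimate. A sketch reusing the tuned classifier from above:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))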