0% found this document useful (0 votes)
27 views11 pages

Ex-13 Data Science

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
27 views11 pages

Ex-13 Data Science

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 11

EX NO 13

DATE DATA SCIENCE

12/10/24

AIM:
To implement Python code that loads a data file and performs data analysis on it.

ALGORITHM:

#LOAD A DATA FILE


import numpy as np
import matplotlib.pyplot as plt

# Load the population table from the tab-separated file; the first row is a
# header and is skipped.
data = np.loadtxt("/content/Populations.txt", delimiter="\t", skiprows=1)

# Column layout assumed by this script: 0 = 'Year', 1 = 'Hares',
# 2 = 'Lynxes', 3 = 'Carrots'.
years, hares, lynxes, carrots = (data[:, column] for column in range(4))

# Plot the three population series against the year on a single figure.
plt.figure(figsize=(6, 3))
for series, series_label, series_marker in (
    (hares, 'Hares', 'o'),
    (lynxes, 'Lynxes', 's'),
    (carrots, 'Carrots', '^'),
):
    plt.plot(years, series, label=series_label, marker=series_marker)

# Title, axis labels, legend and grid.
plt.title('Population of Hares, Lynxes, and Carrots Over 20 Years')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.grid()

# Show the plot.
plt.show()
# (i) Mean and standard deviation of the populations of each species.
# The standard deviation uses NumPy's default ddof=0 (population form).
mean_h, std_h = hares.mean(), hares.std()
mean_l, std_l = lynxes.mean(), lynxes.std()
mean_c, std_c = carrots.mean(), carrots.std()

print("Mean and Standard Deviation of Populations:")
for species_label, species_mean, species_std in (
    ("Hares", mean_h, std_h),
    ("Lynxes", mean_l, std_l),
    ("Carrots", mean_c, std_c),
):
    print(f"{species_label}: Mean = {species_mean}, Std Dev = {species_std}")

OUTPUT:
Mean and Standard Deviation of Populations:
Hares: Mean = 34080.95238095238, Std Dev = 20897.906458089667
Lynxes: Mean = 20166.666666666668, Std Dev = 16254.591536908763
Carrots: Mean = 42400.0, Std Dev = 3322.506225584478

# (ii) Which year did each species have its largest population?
# argmax gives the position of the peak; that position indexes `years`.
year_h_max = years[hares.argmax()]
year_l_max = years[lynxes.argmax()]
year_c_max = years[carrots.argmax()]

print("\nYears of Maximum Population:")
for species_label, peak_year, peak_population in (
    ("Hares", year_h_max, np.max(hares)),
    ("Lynxes", year_l_max, np.max(lynxes)),
    ("Carrots", year_c_max, np.max(carrots)),
):
    print(f"{species_label}: Year = {peak_year}, Population = {peak_population}")

OUTPUT:
Years of Maximum Population:
Hares: Year = 1903.0, Population = 77400.0
Lynxes: Year = 1904.0, Population = 59400.0
Carrots: Year = 1900.0, Population = 48300.0
# (iii) Which species has the largest population for each year?
# Stack the three series as rows; argmax along axis 0 then yields, for every
# year, the row index (0 = hares, 1 = lynxes, 2 = carrots) of the largest value.
species_names = ['Hares', 'Lynxes', 'Carrots']
largest_species = np.argmax(np.array([hares, lynxes, carrots]), axis=0)
largest_species_names = [species_names[i] for i in largest_species]

print("\nLargest Population for Each Year:")
for year, species in zip(years, largest_species_names):
    print(f"Year {year}: {species}")

OUTPUT:
Largest Population for Each Year:
Year 1900.0: Carrots
Year 1901.0: Carrots
Year 1902.0: Hares
Year 1903.0: Hares
Year 1904.0: Lynxes
Year 1905.0: Lynxes
Year 1906.0: Carrots
Year 1907.0: Carrots
Year 1908.0: Carrots
Year 1909.0: Carrots
Year 1910.0: Carrots
Year 1911.0: Carrots
Year 1912.0: Hares
Year 1913.0: Hares
Year 1914.0: Hares
Year 1915.0: Lynxes
Year 1916.0: Carrots
Year 1917.0: Carrots
Year 1918.0: Carrots
Year 1919.0: Carrots
Year 1920.0: Carrots

# (iv) In which years is any of the populations above 50000?
# Build one boolean mask per species and OR them together element-wise.
any_above_50000 = np.logical_or.reduce(
    [hares > 50000, lynxes > 50000, carrots > 50000]
)
years_above_50000 = years[any_above_50000]
print(f"\nYears with populations above 50000: {years_above_50000}")

OUTPUT:
Years with populations above 50000: [1902. 1903. 1904. 1912. 1913. 1914. 1915.]

# (v) The two years in which each species had its lowest populations.
# argsort orders positions by ascending population; the first two positions
# are therefore the two smallest values.
lowest_h = hares.argsort()[:2]
lowest_l = lynxes.argsort()[:2]
lowest_c = carrots.argsort()[:2]

print("\nTop Two Years with Lowest Populations:")
for species_label, series, lowest_positions in (
    ("Hares", hares, lowest_h),
    ("Lynxes", lynxes, lowest_l),
    ("Carrots", carrots, lowest_c),
):
    print(
        f"{species_label}: Year = {years[lowest_positions]}, "
        f"Population = {series[lowest_positions]}"
    )
OUTPUT:
Top Two Years with Lowest Populations:
Hares: Year = [1917. 1916.], Population = [ 7600. 11200.]
Lynxes: Year = [1900. 1901.], Population = [4000. 6100.]
Carrots: Year = [1916. 1903.], Population = [36700. 38200.]

# (1) Correlation between the populations of the species.
# np.corrcoef returns a 2x2 matrix for two inputs; the off-diagonal entry is
# the Pearson correlation of the pair.
correlation_hares_lynxes = np.corrcoef(hares, lynxes)[0][1]
correlation_hares_carrots = np.corrcoef(hares, carrots)[0][1]
correlation_lynxes_carrots = np.corrcoef(lynxes, carrots)[0][1]

print("\nCorrelation Coefficients:")
for pair_label, pair_correlation in (
    ("Hares and Lynxes", correlation_hares_lynxes),
    ("Hares and Carrots", correlation_hares_carrots),
    ("Lynxes and Carrots", correlation_lynxes_carrots),
):
    print(f"{pair_label}: {pair_correlation}")

OUTPUT:
Correlation Coefficients:
Hares and Lynxes: 0.07189206073535571
Hares and Carrots: -0.016603777709879402
Lynxes and Carrots: -0.6805771698401617

# (2) Covariance between the populations of the species.
# np.cov returns a 2x2 matrix for two inputs; the off-diagonal entry is the
# covariance of the pair (NumPy's default normalisation is N - 1).
covariance_hares_lynxes = np.cov(hares, lynxes)[0][1]
covariance_hares_carrots = np.cov(hares, carrots)[0][1]
covariance_lynxes_carrots = np.cov(lynxes, carrots)[0][1]

print("\nCovariance:")
for pair_label, pair_covariance in (
    ("Hares and Lynxes", covariance_hares_lynxes),
    ("Hares and Carrots", covariance_hares_carrots),
    ("Lynxes and Carrots", covariance_lynxes_carrots),
):
    print(f"{pair_label}: {pair_covariance}")

OUTPUT:
Covariance:
Hares and Lynxes: 25641833.33333334
Hares and Carrots: -1210499.9999999998
Lynxes and Carrots: -38592999.99999999

# (3) Which year had the largest combined population?
# Sum the three series element-wise, then locate the peak of the total.
combined_population = hares + lynxes + carrots
year_largest_combined_population = years[np.argmax(combined_population)]
largest_combined_population = combined_population.max()
print(
    f"\nYear with Largest Combined Population: "
    f"Year = {year_largest_combined_population}, "
    f"Population = {largest_combined_population}"
)

OUTPUT:
Year with Largest Combined Population: Year = 1903.0, Population = 150800.0

# (4) Does the hare population follow a normal distribution?
# Shapiro-Wilk test: the null hypothesis H0 is that the sample was drawn from
# a normal distribution, so a p-value above alpha means H0 is not rejected.
from scipy.stats import shapiro

stat, p_value = shapiro(hares)
alpha = 0.05  # significance level
if p_value > alpha:
    print("Hare population follows a normal distribution (fail to reject H0)")
else:
    print("Hare population does not follow a normal distribution (reject H0)")

OUTPUT:
Hare population does not follow a normal distribution (reject H0)

# (5) Calculate the difference between the populations for every year.
# Each array holds one element-wise difference per year for a species pair.
hare_lynx_diff = hares - lynxes      # hare minus lynx population
hare_carrot_diff = hares - carrots   # hare minus carrot population
lynx_carrot_diff = lynxes - carrots  # lynx minus carrot population

# Print the differences for each pair.
print("Yearly Differences:")
print("Hares - Lynxes:", hare_lynx_diff)
print("Hares - Carrots:", hare_carrot_diff)
print("Lynxes - Carrots:", lynx_carrot_diff)

OUTPUT:
Yearly Differences:
Hares - Lynxes: [ 26000. 41100. 60400. 42200. -23100. -21100. -900. 8400.
13700.
16300. 19700. 32300. 44700. 57100. 6600. -31600. -18500. -8200.
4900. 6100. 16100.]
Hares - Carrots: [-18300. -1000. 28700. 39200. -4300. -19200. -20500. -20900. -
22500.
-16700. -18900. -6500. 13200. 35700. 12900. -19500. -25500. -34200.
-28700. -25100. -22600.]
Lynxes - Carrots: [-44300. -42100. -31700. -3000. 18800. 1900. -19600. -29300. -
36200.
-33000. -38600. -38800. -31500. -21400. 6300. 12100. -7000. -26000.
-33600. -31200. -38700.]

# (6) Plot the intra-year differences between all pairs of attributes
# (hare-lynx, hare-carrot, lynx-carrot) as subplots, each with a title,
# legend and axis labels.
plt.figure(figsize=(10, 6))

# One entry per panel: (difference series, pair label, marker, line colour).
difference_panels = (
    (hare_lynx_diff, 'Hares - Lynxes', 'o', 'blue'),
    (hare_carrot_diff, 'Hares - Carrots', 's', 'green'),
    (lynx_carrot_diff, 'Lynxes - Carrots', '^', 'orange'),
)

for panel_number, (pair_diff, pair_label, pair_marker, pair_color) in enumerate(
    difference_panels, start=1
):
    # Three rows, one column; panels are numbered from the top.
    plt.subplot(3, 1, panel_number)
    plt.plot(years, pair_diff, label=pair_label, marker=pair_marker, color=pair_color)
    plt.title(f'Yearly Population Difference: {pair_label}')
    plt.xlabel('Year')
    plt.ylabel('Population Difference')
    # Dashed zero line marks where the two populations are equal.
    plt.axhline(0, color='grey', lw=0.8, ls='--')
    plt.legend()
    plt.grid()

plt.tight_layout()
plt.show()

OUTPUT:
# (7) Maximum, minimum, mean and standard deviation of the yearly differences.
def calculate_statistics(diff):
    """Return summary statistics of a sequence of differences.

    Args:
        diff: Array-like of numeric values (e.g. one yearly-difference series).

    Returns:
        A 4-tuple ``(max, min, mean, std)``. The standard deviation uses
        NumPy's default ``ddof=0`` (population form).
    """
    max_diff = np.max(diff)
    min_diff = np.min(diff)
    mean_diff = np.mean(diff)
    std_diff = np.std(diff)
    return max_diff, min_diff, mean_diff, std_diff

# Get (max, min, mean, std) for each pairwise difference series.
hare_lynx_stats = calculate_statistics(hare_lynx_diff)
hare_carrot_stats = calculate_statistics(hare_carrot_diff)
lynx_carrot_stats = calculate_statistics(lynx_carrot_diff)

# Print the results; each tuple is unpacked into the four format fields.
print(
    "Hares - Lynxes Statistics: Max: {:.2f}, Min: {:.2f}, Mean: {:.2f}, "
    "Std: {:.2f}".format(*hare_lynx_stats)
)
print(
    "Hares - Carrots Statistics: Max: {:.2f}, Min: {:.2f}, Mean: {:.2f}, "
    "Std: {:.2f}".format(*hare_carrot_stats)
)
print(
    "Lynxes - Carrots Statistics: Max: {:.2f}, Min: {:.2f}, Mean: {:.2f}, "
    "Std: {:.2f}".format(*lynx_carrot_stats)
)

OUTPUT:
Hares - Lynxes Statistics: Max: 60400.00, Min: -31600.00, Mean: 13914.29, Std:
25536.10
Hares - Carrots Statistics: Max: 39200.00, Min: -34200.00, Mean: -8319.05, Std:
21214.79
Lynxes - Carrots Statistics: Max: 18800.00, Min: -44300.00, Mean: -22233.33, Std:
18675.15

# (8) Does the hare population outnumber the lynx population in more years,
# or vice versa?
# Summing a boolean mask counts the years in which the comparison holds.
hare_greater = np.sum(hares > lynxes)
lynx_greater = np.sum(lynxes > hares)

if hare_greater > lynx_greater:
    print(f"Hares outnumber lynxes in {hare_greater} years.")
elif lynx_greater > hare_greater:
    print(f"Lynxes outnumber hares in {lynx_greater} years.")
else:
    # Equal counts: neither species leads, so do not report a winner.
    print(f"Hares and lynxes each outnumber the other in {hare_greater} years.")

OUTPUT:
Hares outnumber lynxes in 15 years.

# (9) Create a class with methods for the above questions, with the dataset
# loaded in the constructor.
class PopulationAnalysis:
    """Population table loaded from a tab-separated file.

    Attributes are set in ``__init__``: ``data`` is the full 2-D array and
    ``years``, ``hares``, ``lynxes``, ``carrots`` are its columns 0 to 3.
    """

    def __init__(self, filepath):
        """Load the dataset.

        Args:
            filepath: Anything ``np.loadtxt`` accepts as its source; the
                first row is treated as a header and skipped.
        """
        self.data = np.loadtxt(filepath, delimiter="\t", skiprows=1)
        self.years = self.data[:, 0]
        self.hares = self.data[:, 1]
        self.lynxes = self.data[:, 2]
        self.carrots = self.data[:, 3]

    def compare_hare_lynx(self):
        """Count the years in which each of hares and lynxes outnumbers the other.

        Returns:
            ``(hare_greater, lynx_greater)``: the number of rows where hares
            exceed lynxes and where lynxes exceed hares. Rows with equal
            values count towards neither.
        """
        hare_greater = np.sum(self.hares > self.lynxes)
        lynx_greater = np.sum(self.lynxes > self.hares)
        return hare_greater, lynx_greater

# Usage: load the dataset through the class and report both counts.
analysis = PopulationAnalysis("/content/Populations.txt")
hare_count, lynx_count = analysis.compare_hare_lynx()
print(
    f"Hares outnumber lynxes in {hare_count} years; "
    f"lynxes in {lynx_count} years."
)

OUTPUT:
Hares outnumber lynxes in 15 years; lynxes in 6 years.

# (10) How many missing values (n.a) are there in the dataset?
# A missing value is represented as NaN; summing the boolean NaN mask counts them.
nan_mask = np.isnan(data)
missing_values = nan_mask.sum()
print(f"Number of missing values: {missing_values}")

OUTPUT:
Number of missing values: 0

# (11) Replace any missing values with the mean value for that column (field).
# Reload with np.genfromtxt rather than np.loadtxt: loadtxt raises ValueError
# on a non-numeric token such as "n.a", whereas genfromtxt (default loose=True)
# converts fields it cannot parse as numbers into NaN, which is what the
# replacement step below needs.
data = np.genfromtxt("/content/Populations.txt", delimiter="\t", skip_header=1)

# Per-column means computed while ignoring NaNs.
means = np.nanmean(data, axis=0)

# Wherever a cell is NaN take that column's mean (broadcast across rows),
# otherwise keep the original value.
data_with_nan_replaced = np.where(np.isnan(data), means, data)

print("Data with missing values replaced by column means:")
print(data_with_nan_replaced)

OUTPUT:
Data with missing values replaced by column means:
[[ 1900. 30000. 4000. 48300.]
[ 1901. 47200. 6100. 48200.]
[ 1902. 70200. 9800. 41500.]
[ 1903. 77400. 35200. 38200.]
[ 1904. 36300. 59400. 40600.]
[ 1905. 20600. 41700. 39800.]
[ 1906. 18100. 19000. 38600.]
[ 1907. 21400. 13000. 42300.]
[ 1908. 22000. 8300. 44500.]
[ 1909. 25400. 9100. 42100.]
[ 1910. 27100. 7400. 46000.]
[ 1911. 40300. 8000. 46800.]
[ 1912. 57000. 12300. 43800.]
[ 1913. 76600. 19500. 40900.]
[ 1914. 52300. 45700. 39400.]
[ 1915. 19500. 51100. 39000.]
[ 1916. 11200. 29700. 36700.]
[ 1917. 7600. 15800. 41800.]
[ 1918. 14600. 9700. 43300.]
[ 1919. 16200. 10100. 41300.]
[ 1920. 24700. 8600. 47300.]]

# (12) If there is a missing value, raise a user-defined NAError displaying the
# row and column where the n.a occurs.
class NAError(Exception):
    """Raised when a missing value is found in the dataset.

    Attributes are set in ``__init__``: ``row`` and ``col`` hold the position
    passed by the caller, and the exception message embeds both.
    """

    def __init__(self, row, col):
        """Store the position of the missing value and build the message.

        Args:
            row: Row index of the missing value.
            col: Column index of the missing value.
        """
        self.row = row
        self.col = col
        super().__init__(f"Missing value found at row {row}, column {col}")

# Check for missing values and raise NAError for the first one found.
# np.argwhere lists the (row, column) index of every NaN in row-major order,
# so its first entry is the same cell a row-by-row, column-by-column scan
# would reach first.
nan_positions = np.argwhere(np.isnan(data))
if nan_positions.size:
    first_row, first_col = nan_positions[0]
    # Convert to plain ints so the message shows e.g. "row 2, column 3".
    raise NAError(int(first_row), int(first_col))

# Reached only when no NAError was raised.
print("No missing values found in the dataset.")

OUTPUT:
No missing values found in the dataset.

# (13) Test the hypothesis that the average hare population is 4000.
from scipy import stats

# One-sample t-test of the hare series against the hypothesised mean.
t_stat, p_value = stats.ttest_1samp(hares, popmean=4000)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

OUTPUT:
T-statistic: 6.437300739826584, P-value: 2.8073145089645608e-06

#(14) Take a random sample of size 20 from the carrot attribute
# replace=False draws without replacement, so no array position is picked
# twice; NumPy raises ValueError if `carrots` holds fewer than 20 values.
# The draw uses NumPy's global random state, so the sample differs between
# runs unless a seed has been set beforehand.
sample_carrots = np.random.choice(carrots, size=20, replace=False)
print("Random sample of carrots:", sample_carrots)

OUTPUT:
Random sample of carrots: [38600. 38200. 40600. 48300. 41500. 48200. 39000.
42300. 44500. 36700.
39800. 41300. 47300. 39400. 42100. 46000. 43300. 46800. 43800. 41800.]

# (15) Create a 21x3 matrix with the hare, lynx, and carrot values and find
# the eigenvalues of this matrix.
# One column per species, one row per year.
population_matrix = np.column_stack((hares, lynxes, carrots))

# Eigenvalues are only defined for square matrices, so the script works with
# the 3x3 covariance matrix of the species instead. np.cov expects one
# variable per row, hence the transpose.
covariance_matrix = np.cov(population_matrix.T)

# Eigenvalues of the covariance matrix.
eigenvalues = np.linalg.eigvals(covariance_matrix)
print("Eigenvalues:", eigenvalues)

OUTPUT:
Eigenvalues: [4.62214104e+08 2.79269294e+08 6.08855484e+06]

# (16) Use regular expressions to count the characters, numbers, and special
# characters in the raw dataset.
import re

# Load the raw dataset as a single string.
with open("/content/Populations.txt", 'r') as file:
    raw_data = file.read()

# NOTE: \w matches letters, digits and underscore, so `characters` includes
# the digits counted again in `numbers`; \W matches everything else, which
# includes whitespace such as tabs and newlines.
characters = len(re.findall(r'\w', raw_data))  # word characters
numbers = len(re.findall(r'\d', raw_data))  # digits
special_characters = len(re.findall(r'\W', raw_data))  # non-word characters

print(
    f"Characters: {characters}, Numbers: {numbers}, "
    f"Special Characters: {special_characters}"
)

OUTPUT:
Characters: 400, Numbers: 340, Special Characters: 122

# (17) If one of the columns were normally distributed, compute the z-score of
# the values.
# Assuming 'hares' is normally distributed: z = (x - mean) / std, with the
# standard deviation in NumPy's default ddof=0 (population) form.
z_scores = (hares - np.mean(hares)) / np.std(hares)
print("Z-scores of hare population:", z_scores)

OUTPUT:
Z-scores of hare population: [-0.19528044 0.62776851 1.72835723 2.07288934
0.10618516 -0.64508626
-0.76471547 -0.60680492 -0.57809391 -0.41539818 -0.33405032 0.2975919
1.09671501 2.034608 0.8718121 -0.69772311 -1.09489209 -1.26715814
-0.93219636 -0.85563367 -0.44889436]

# (18) Compute the skewness and kurtosis of the hare population.
from scipy.stats import skew, kurtosis

# scipy's kurtosis defaults to the Fisher definition (a normal distribution
# scores 0.0).
hare_skewness, hare_kurtosis = skew(hares), kurtosis(hares)
print(f"Hare Population Skewness: {hare_skewness}, Kurtosis: {hare_kurtosis}")

OUTPUT:
Hare Population Skewness: 0.8707926603586706,
Kurtosis: -0.4713852285780282

RESULT:
Hence, a data file was loaded and a complete statistical analysis was performed
on it using Python code.

You might also like