Ex-13 Data Science
Ex-13 Data Science
12/10/24
AIM:
To implement Python code that loads a data file and performs data analysis on it.
ALGORITHM:
# Assuming the first column is 'Year', second is 'Hares', third is 'Lynxes', and fourth is
# 'Carrots'
# Pull the first four columns of the table out as separate 1-D series.
years, hares, lynxes, carrots = (data[:, column] for column in range(4))

# Mean and standard deviation of the lynx and carrot series.
mean_l, std_l = np.mean(lynxes), np.std(lynxes)
mean_c, std_c = np.mean(carrots), np.std(carrots)
OUTPUT:
Mean and Standard Deviation of Populations:
Hares: Mean = 34080.95238095238, Std Dev = 20897.906458089667
Lynxes: Mean = 20166.666666666668, Std Dev = 16254.591536908763
Carrots: Mean = 42400.0, Std Dev = 3322.506225584478
OUTPUT:
Years of Maximum Population:
Hares: Year = 1903.0, Population = 77400.0
Lynxes: Year = 1904.0, Population = 59400.0
Carrots: Year = 1900.0, Population = 48300.0
# (iii) Which species has the largest population for each year?
species_names = ['Hares', 'Lynxes', 'Carrots']
# One row per species, in the same order as species_names, so that the row
# index returned by argmax maps directly onto a name.
population_rows = np.array([hares, lynxes, carrots])
largest_species = population_rows.argmax(axis=0)
largest_species_names = [species_names[winner] for winner in largest_species]
OUTPUT:
Largest Population for Each Year:
Year 1900.0: Carrots
Year 1901.0: Carrots
Year 1902.0: Hares
Year 1903.0: Hares
Year 1904.0: Lynxes
Year 1905.0: Lynxes
Year 1906.0: Carrots
Year 1907.0: Carrots
Year 1908.0: Carrots
Year 1909.0: Carrots
Year 1910.0: Carrots
Year 1911.0: Carrots
Year 1912.0: Hares
Year 1913.0: Hares
Year 1914.0: Hares
Year 1915.0: Lynxes
Year 1916.0: Carrots
Year 1917.0: Carrots
Year 1918.0: Carrots
Year 1919.0: Carrots
Year 1920.0: Carrots
OUTPUT:
Years with populations above 50000: [1902. 1903. 1904. 1912. 1913. 1914. 1915.]
# (v) The top two years for each species when they had the lowest populations.
# argsort orders indices by ascending value, so the first two entries are the
# positions of the two smallest populations in each series.
lowest_h, lowest_l, lowest_c = (
    np.argsort(series)[:2] for series in (hares, lynxes, carrots)
)
# Report the pairwise correlation coefficients, one line per species pair.
print("\nCorrelation Coefficients:")
for pair_label, coefficient in (
    ("Hares and Lynxes", correlation_hares_lynxes),
    ("Hares and Carrots", correlation_hares_carrots),
    ("Lynxes and Carrots", correlation_lynxes_carrots),
):
    print(f"{pair_label}: {coefficient}")
OUTPUT:
Correlation Coefficients:
Hares and Lynxes: 0.07189206073535571
Hares and Carrots: -0.016603777709879402
Lynxes and Carrots: -0.6805771698401617
OUTPUT:
Covariance:
Hares and Lynxes: 25641833.33333334
Hares and Carrots: -1210499.9999999998
Lynxes and Carrots: -38592999.99999999
OUTPUT:
Year with Largest Combined Population: Year = 1903.0, Population = 150800.0
OUTPUT:
Hare population does not follow a normal distribution (reject H0)
OUTPUT:
Yearly Differences:
Hares - Lynxes: [ 26000. 41100. 60400. 42200. -23100. -21100. -900. 8400.
13700.
16300. 19700. 32300. 44700. 57100. 6600. -31600. -18500. -8200.
4900. 6100. 16100.]
Hares - Carrots: [-18300. -1000. 28700. 39200. -4300. -19200. -20500. -20900.
-22500.
-16700. -18900. -6500. 13200. 35700. 12900. -19500. -25500. -34200.
-28700. -25100. -22600.]
Lynxes - Carrots: [-44300. -42100. -31700. -3000. 18800. 1900. -19600. -29300.
-36200.
-33000. -38600. -38800. -31500. -21400. 6300. 12100. -7000. -26000.
-33600. -31200. -38700.]
#(6) Plot the intrayear differences between all pairs of attributes (hare-lynx,
# hare-carrot, lynx-carrot) as subplots. Provide a title, legend and labels.
# Step 6: Plot the differences
plt.figure(figsize=(10, 6))

# One entry per subplot: (pair label, difference series, marker, line colour).
panels = (
    ('Hares - Lynxes', hare_lynx_diff, 'o', 'blue'),
    ('Hares - Carrots', hare_carrot_diff, 's', 'green'),
    ('Lynxes - Carrots', lynx_carrot_diff, '^', 'orange'),
)

# Draw the three panels stacked vertically, top to bottom.
for position, (pair_label, diff_series, marker_shape, line_color) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    plt.plot(years, diff_series, label=pair_label, marker=marker_shape, color=line_color)
    plt.title(f'Yearly Population Difference: {pair_label}')
    plt.xlabel('Year')
    plt.ylabel('Population Difference')
    # Dashed zero line marks where the sign of the difference flips.
    plt.axhline(0, color='grey', lw=0.8, ls='--')
    plt.legend()
    plt.grid()

plt.tight_layout()
plt.show()
OUTPUT:
#(7) What are the maximum, minimum, mean and standard deviation of the yearly
# differences?
# Step 7: Calculate statistics
def calculate_statistics(diff):
    """Summarise a series of differences.

    Args:
        diff: Array-like of numeric values.

    Returns:
        A 4-tuple ``(max, min, mean, std)`` obtained from ``np.max``,
        ``np.min``, ``np.mean`` and ``np.std`` respectively.
    """
    max_diff = np.max(diff)
    min_diff = np.min(diff)
    mean_diff = np.mean(diff)
    std_diff = np.std(diff)
    return max_diff, min_diff, mean_diff, std_diff
# Print results
# Each *_stats value is unpacked positionally into the four {:.2f} slots, so
# it must hold (max, min, mean, std) in that order.
print(
    "Hares - Lynxes Statistics: Max: {:.2f}, Min: {:.2f}, Mean: {:.2f}, "
    "Std: {:.2f}".format(*hare_lynx_stats)
)
print(
    "Hares - Carrots Statistics: Max: {:.2f}, Min: {:.2f}, Mean: {:.2f}, "
    "Std: {:.2f}".format(*hare_carrot_stats)
)
print(
    "Lynxes - Carrots Statistics: Max: {:.2f}, Min: {:.2f}, Mean: {:.2f}, "
    "Std: {:.2f}".format(*lynx_carrot_stats)
)
OUTPUT:
Hares - Lynxes Statistics: Max: 60400.00, Min: -31600.00, Mean: 13914.29, Std:
25536.10
Hares - Carrots Statistics: Max: 39200.00, Min: -34200.00, Mean: -8319.05, Std:
21214.79
Lynxes - Carrots Statistics: Max: 18800.00, Min: -44300.00, Mean: -22233.33, Std:
18675.15
#(8) Does the hare population outnumber the lynx population in more years, or
# vice versa?
# Build one boolean mask per direction, then count the True entries.
# Years in which both populations are equal fall into neither count.
hares_lead = hares > lynxes
lynxes_lead = lynxes > hares
hare_greater = np.sum(hares_lead)
lynx_greater = np.sum(lynxes_lead)
OUTPUT:
Hares outnumber lynxes in 15 years.
#(9) Create a class with methods for the above questions, with the dataset loaded
# in the constructor.
class PopulationAnalysis:
    """Load the population table once and answer questions about it.

    After construction the instance exposes the full table as ``data`` and
    its first four columns as ``years``, ``hares``, ``lynxes`` and
    ``carrots``.
    """

    def __init__(self, filepath):
        """Read a tab-separated table whose first row is a header.

        Args:
            filepath: File name or open file object accepted by
                ``np.loadtxt``.
        """
        # ndmin=2 keeps the table two-dimensional even when the file holds a
        # single data row; otherwise the column slices below would fail.
        self.data = np.loadtxt(filepath, delimiter="\t", skiprows=1, ndmin=2)
        self.years = self.data[:, 0]
        self.hares = self.data[:, 1]
        self.lynxes = self.data[:, 2]
        self.carrots = self.data[:, 3]

    def compare_hare_lynx(self):
        """Count the years in which each of hares and lynxes leads the other.

        Returns:
            ``(hare_greater, lynx_greater)``. Years where the two values are
            equal are counted in neither.
        """
        hare_greater = np.sum(self.hares > self.lynxes)
        lynx_greater = np.sum(self.lynxes > self.hares)
        return hare_greater, lynx_greater
# Usage: load the dataset and report how often each of the two species leads.
analysis = PopulationAnalysis("/content/Populations.txt")
hare_count, lynx_count = analysis.compare_hare_lynx()
print(f"Hares outnumber lynxes in {hare_count} years; lynxes in {lynx_count} years.")
Hares outnumber lynxes in 15 years; lynxes in 6 years.
#(10) How many missing values (n.a) are there in the dataset?
# Flag every NaN cell in the table, then count the flagged cells.
nan_mask = np.isnan(data)
missing_values = nan_mask.sum()
print(f"Number of missing values: {missing_values}")
Number of missing values: 0
#(11) Replace any missing values with the mean value for that column (or field)
# Reload the tab-separated dataset for reference, skipping its header row.
data = np.loadtxt(
    "/content/Populations.txt",
    skiprows=1,
    delimiter="\t",
)
OUTPUT:
Data with missing values replaced by column means:
[[ 1900. 30000. 4000. 48300.]
[ 1901. 47200. 6100. 48200.]
[ 1902. 70200. 9800. 41500.]
[ 1903. 77400. 35200. 38200.]
[ 1904. 36300. 59400. 40600.]
[ 1905. 20600. 41700. 39800.]
[ 1906. 18100. 19000. 38600.]
[ 1907. 21400. 13000. 42300.]
[ 1908. 22000. 8300. 44500.]
[ 1909. 25400. 9100. 42100.]
[ 1910. 27100. 7400. 46000.]
[ 1911. 40300. 8000. 46800.]
[ 1912. 57000. 12300. 43800.]
[ 1913. 76600. 19500. 40900.]
[ 1914. 52300. 45700. 39400.]
[ 1915. 19500. 51100. 39000.]
[ 1916. 11200. 29700. 36700.]
[ 1917. 7600. 15800. 41800.]
[ 1918. 14600. 9700. 43300.]
[ 1919. 16200. 10100. 41300.]
[ 1920. 24700. 8600. 47300.]]
#(12) If there is a missing value, raise a user-defined NAError displaying the row
# and column where the n.a occurs.
class NAError(Exception):
    """Raised to report a missing value at a specific table position.

    The position is kept on the instance as ``row`` and ``col`` and is also
    included in the exception message.
    """

    def __init__(self, row, col):
        """Record the position and build the message.

        Args:
            row: Row identifier of the missing value.
            col: Column identifier of the missing value.
        """
        self.row = row
        self.col = col
        super().__init__(f"Missing value found at row {row}, column {col}")
OUTPUT:
No missing values found in the dataset.
#(13) Test the hypothesis that the average hare population is 4000
from scipy import stats
OUTPUT:
Random sample of carrots: [38600. 38200. 40600. 48300. 41500. 48200. 39000.
42300. 44500. 36700.
39800. 41300. 47300. 39400. 42100. 46000. 43300. 46800. 43800. 41800.]
#(15) Create a 21x3 matrix with the hare, lynx, and carrot values. Find the
# eigenvalues of this matrix.
# Place the three series side by side: one row per year, one column per species.
population_matrix = np.stack((hares, lynxes, carrots), axis=1)
OUTPUT:
Eigenvalues: [4.62214104e+08 2.79269294e+08 6.08855484e+06]
#(16) Use regular expressions to count the characters, numbers, and special
# characters in the raw dataset.
import re
OUTPUT:
Characters: 400, Numbers: 340, Special Characters: 122
#(17) If one of the columns were normally distributed, compute the z-score of the
# values.
# Treat the hare series as normally distributed and standardise it:
# subtract the series mean, then divide by the series standard deviation.
hare_mean = np.mean(hares)
hare_std = np.std(hares)
z_scores = (hares - hare_mean) / hare_std
print("Z-scores of hare population:", z_scores)
OUTPUT:
Z-scores of hare population: [-0.19528044 0.62776851 1.72835723 2.07288934
0.10618516 -0.64508626
-0.76471547 -0.60680492 -0.57809391 -0.41539818 -0.33405032 0.2975919
1.09671501 2.034608 0.8718121 -0.69772311 -1.09489209 -1.26715814
-0.93219636 -0.85563367 -0.44889436]
OUTPUT:
Hare Population Skewness: 0.8707926603586706,
Kurtosis: -0.4713852285780282
RESULT:
Hence, a data file was loaded and a complete statistical analysis was performed
using Python code.