56 Assignments

This document covers the SciPy, Pandas, and NumPy libraries. It shows how to use SciPy functions such as linalg to solve linear equations and to work with random distributions, demonstrates creating and manipulating Pandas DataFrames from different data structures, and covers merging, concatenating, and handling missing data in DataFrames.


SCIPY

#import required libraries


import numpy as np
from scipy import linalg
#The test has 30 questions and is worth 150 marks in total
#True/false questions are worth 4 marks each
#Multiple choice questions are worth 9 marks each

#let x be the number of true/false questions
#let y be the number of multiple choice questions

# (x + y = 30 )
# (4x + 9y = 150)
testQuestionVariable = np.array([[1,1],[4,9]])
testQuestionValue = np.array([30,150])
#use linalg function of Scipy
#use solve method to solve the linear equation and find value for x and y
linalg.solve(testQuestionVariable,testQuestionValue)
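
As a quick sanity check (not part of the original exercise), the solution can be verified by multiplying the coefficient matrix by the result and comparing it with the right-hand side:

#verify the solution: A @ x should reproduce the constants vector
solution = linalg.solve(testQuestionVariable,testQuestionValue)
np.allclose(testQuestionVariable @ solution, testQuestionValue)  #expected: True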

#import required library for normal distribution


from scipy.stats import norm
#draw 20 random variates from the standard normal distribution
norm.rvs(loc=0,scale=1,size=20)
#evaluate the Cumulative Distribution Function (CDF) at x=10, with loc=1 and scale=3
norm.cdf(10,loc=1,scale=3)
#evaluate the Probability Density Function (PDF) at x=14, with loc=1 and scale=1
norm.pdf(14,loc=1,scale=1)
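
Because the CDF gives P(X <= x), the probability that a normal variate falls in an interval is the difference of two CDF values; a small illustrative sketch:

#probability that a standard normal variate lies between -1 and 1 (about 0.6827)
norm.cdf(1,loc=0,scale=1) - norm.cdf(-1,loc=0,scale=1)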

#import the required libraries


import numpy as np
from scipy import linalg
#test_data matrix - (rating on scale of 10)
test_rating_data = np.array([[5,8],[7,9]])
eigenValues, eigenVector = linalg.eig(test_rating_data)
first_eigen, second_eigen = eigenValues
#print eigen values (first and second eigen values)
print(first_eigen, second_eigen)
#print first eigen vector
print(eigenVector[:,0])
#print second eigen vector
print(eigenVector[:,1])
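
As an optional check (added here for illustration), each eigenpair should satisfy A @ v = lambda * v:

#verify the first eigenpair: A @ v should equal lambda * v
np.allclose(test_rating_data @ eigenVector[:,0], first_eigen * eigenVector[:,0])  #expected: True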
PANDAS
Dataframe.ipynb:
import pandas as pd

#Create DataFrame from dict of equal length list

#last five Summer Olympics: host city, year, and number of participating countries
olympic_data_list = {'HostCity':['London','Beijing','Athens','Sydney','Atlanta'],
'Year':[2012,2008,2004,2000,1996],
'No. of Participating Countries':[205,204,201,200,197]
}
df_olympic_data = pd.DataFrame(olympic_data_list)
df_olympic_data

#Create DataFrame from dict of dicts

olympic_data_dict = {'London':{2012:205},'Beijing':{2008:204}}
df_olympic_data_dict = pd.DataFrame(olympic_data_dict)
df_olympic_data_dict
#select the HostCity column by name
df_olympic_data.HostCity
#use the describe() function to view summary statistics
df_olympic_data.describe()

#Create DataFrame from dict of series

olympic_series_participation = pd.Series([205,204,201,200,197],index=[2012,2008,2004,2000,1996])
olympic_series_country = pd.Series(['London','Beijing','Athens','Sydney','Atlanta'],
                                   index=[2012,2008,2004,2000,1996])
df_olympic_series = pd.DataFrame({'No. of Participating Countries':olympic_series_participation,
                                  'Host Cities':olympic_series_country})
df_olympic_series

#Create DataFrame from dict of ndarray

import numpy as np
np_array = np.array([2012,2008,2004,2000])
dict_ndarray = {'year':np_array}
df_ndarray = pd.DataFrame(dict_ndarray)
df_ndarray
#Create DataFrame from DataFrame object

df_from_df = pd.DataFrame(df_olympic_series)
df_from_df
#view values
df_from_df.values

View dataset

#view top 2 rows of the dataset


df_from_df.head(2)
#view bottom two rows of dataset
df_from_df.tail(2)
#view indexes of dataset
df_from_df.index
#view columns of the dataset
df_from_df.columns

Select dataset

#select column name from the dataset


df_from_df['No. of Participating Countries']
#another selection by column name
df_from_df['Host Cities']
#select a row by index label
df_from_df.loc[2012]
#select elements by slicing from 0 to 2
df_from_df.iloc[0:2]
#select element by position
df_from_df.iat[2,1]
#select rows by boolean indexing where more than 200 countries participated
df_from_df[df_from_df['No. of Participating Countries']>200]

#View & Select Data

#import libraries
import numpy as np
import pandas as pd

#create dataframe from dict of series for summer olympics : 1996 to 2012
olympic_series_participation = pd.Series([205,204,201,200,197],index=[2012,2008,2004,2000,1996])
olympic_series_country = pd.Series(['London','Beijing','Athens','Sydney','Atlanta'],
                                   index=[2012,2008,2004,2000,1996])
df_olympic_series = pd.DataFrame({'No. of Participating Countries':olympic_series_participation,
                                  'Host Cities':olympic_series_country})

# display content of the dataset


df_olympic_series

View Data

#view summary statistics with describe()


df_olympic_series.describe()
#view top 2 records
df_olympic_series.head(2)
#view last 3 records
df_olympic_series.tail(3)
#view indexes of dataset
df_olympic_series.index
#view columns of the dataset
df_olympic_series.columns

Select Data
#select data for Host Cities
df_olympic_series['Host Cities']
#another data selection: No. of Participating Countries
df_olympic_series['No. of Participating Countries']
#label-based access with loc
df_olympic_series.loc[2012]
#integer-location based indexing by position
df_olympic_series.iloc[0:2]
#integer-position based scalar access with iat
df_olympic_series.iat[3,1]
#select rows where the number of participating countries is more than 200
# hint - use boolean expression
df_olympic_series[df_olympic_series['No. of Participating Countries']>200]

Data Operation Demo


#import libraries
import pandas as pd
#create test score dataset for test takers
df_test_scores = pd.DataFrame({'Math':[91,97,66,83,45],
'English':[93,88,55,65,74]},
index=['James','David','Stacy','Travis','Mike'])
#view the content of the dataset
df_test_scores
#use describe() function to view dataset statistics
df_test_scores.describe()
#define a custom function to grade the test scores
def test_grade(score):
    if score > 90:
        return 'A'
    elif score > 80:
        return 'B'
    elif score > 70:
        return 'C'
    elif score > 60:
        return 'D'
    else:
        return 'F'
#validate/test the custom function
test_grade(85)
#use applymap method to the dataset to view the grade for tests
df_test_scores.applymap(test_grade)
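
Note: in recent pandas releases (2.1 and later) DataFrame.applymap is deprecated in favour of DataFrame.map; on those versions the equivalent call would be:

#equivalent elementwise call on pandas >= 2.1, where applymap is deprecated
df_test_scores.map(test_grade)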

#Merge, Concatenate, and Drop Duplicates


import numpy as np
import pandas as pd
df_student_test_math_data = pd.DataFrame({'student':['Tom','Jack','Dan','Ram','Jeff','David'],
'ID':[10,56,31,85,9,22]
})
df_student_test_science_data = pd.DataFrame({'student':['Tom','Ram','David'],
'ID':[10,85,22]
})
#inner join on all common columns (student and ID)
pd.merge(df_student_test_math_data,df_student_test_science_data)
#inner join on the student column only
pd.merge(df_student_test_math_data,df_student_test_science_data,on='student')
#right join on ID keeps all rows from the science dataset
pd.merge(df_student_test_math_data,df_student_test_science_data,on='ID',how='right')
#left join on ID, filling missing values with 'X'
pd.merge(df_student_test_math_data,df_student_test_science_data,on='ID',how='left').fillna('X')
#outer join on ID keeps rows from both datasets
pd.merge(df_student_test_math_data,df_student_test_science_data,on='ID',how='outer')
#concatenate the two datasets, renumbering the index
pd.concat([df_student_test_math_data,df_student_test_science_data],ignore_index=True)
df_student_survey_data = pd.DataFrame({'student':['Tom','Jack','Tom','Ram','Jeff','Jack'],
'ID':[10,56,10,85,9,56]
})
#view the survey data
df_student_survey_data
#flag duplicated rows
df_student_survey_data.duplicated()
#drop fully duplicated rows
df_student_survey_data.drop_duplicates()
#drop duplicates considering only the student column
df_student_survey_data.drop_duplicates(['student'])
#drop duplicates considering only the ID column
df_student_survey_data.drop_duplicates('ID')
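
By default drop_duplicates keeps the first occurrence of each duplicate; the keep parameter changes this, for example:

#keep the last occurrence of each duplicated student instead of the first
df_student_survey_data.drop_duplicates(['student'],keep='last')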
#Database interaction with SQL

#import pandas library


import pandas as pd
#import sqlite3
import sqlite3

#Create SQL table


create_SQL_table = """
CREATE TABLE student_test_score
(Id INTEGER, Name VARCHAR(20), Math REAL,
Science REAL
);"""

#connect to an in-memory SQLite database and execute the statement


executeSQL = sqlite3.connect(':memory:')
executeSQL.execute(create_SQL_table)
executeSQL.commit()

#prepare a SQL query


SQL_query = executeSQL.execute('select * from student_test_score')

#fetch the result from the SQLite database


resultSet = SQL_query.fetchall()

#view result
resultSet

#prepare records to be inserted into SQL table through SQL statement


insertData_SQL = [(10,'Jack',85,92),
(29,'Tom',73,89),
(65,'Ram',65.5,77),
(5,'Steve',55,91)
]

#insert records into SQL table through SQL statement


insert_statement = "Insert into student_test_score values(?,?,?,?)"
executeSQL.executemany(insert_statement,insertData_SQL)
executeSQL.commit()

#prepare SQL query


SQL_query = executeSQL.execute('select * from student_test_score')

#fetch the resultset for the query


resultSet = SQL_query.fetchall()
#view the resultset
resultSet

#put the records together in a pandas dataframe


df_student_records = pd.DataFrame(resultSet,columns=[col[0] for col in SQL_query.description])

#view the records in pandas dataframe


df_student_records
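
Pandas can also build the DataFrame directly from the query, which avoids handling the cursor description by hand:

#equivalent one-step approach using pandas' SQL reader
df_student_records = pd.read_sql_query('select * from student_test_score', executeSQL)
df_student_records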

#MISSING VALUES

import pandas as pd

#declare first series


first_series = pd.Series([1,2,3,4,5],index=['a','b','c','d','e'])

#declare second series


second_series=pd.Series([10,20,30,40,50],index=['c','e','f','g','h'])

sum_of_series = first_series+second_series

sum_of_series

#drop NaN (Not a Number) values from the dataset


dropna_s = sum_of_series.dropna()

dropna_s

#note: dropna_s no longer contains NaN, so this fillna is a no-op
dropna_s.fillna(0)

#fill NaN (Not a Number) values with zeroes (0)


fillna_s = sum_of_series.fillna(0)

fillna_s

#fill missing indices with zeroes before performing the addition
fill_NaN_with_zeros_before_sum = first_series.add(second_series,fill_value=0)

fill_NaN_with_zeros_before_sum
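
The same fill_value option exists on the other arithmetic methods (sub, mul, div); for example:

#subtract with missing indices treated as 0
first_series.sub(second_series,fill_value=0)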

Start Pandas Series Exercises

Exercise 01 : Create simple series

import numpy as np
import pandas as pd
#print a simple series with list as an argument
first_series = pd.Series(list('abcdef'))
print (first_series)

Exercise 02 : Create Series from ndarray

#create a series using ndarray countries data


np_country = np.array(['Luxembourg','Norway','Japan','Switzerland','United States','Qatar',
                       'Iceland','Sweden','Singapore','Denmark'])

s_country = pd.Series(np_country)
print (s_country)

Exercise 03 : Create Series from dict

#Evaluate countries and their corresponding gdp per capita and print them as series
dict_country_gdp = pd.Series([52056.01781,40258.80862,40034.85063,39578.07441,39170.41371,
                              37958.23146,37691.02733,36152.66676,34706.19047,33630.24604,
                              33529.83052,30860.12808],
                             index=['Luxembourg','Macao, China','Norway','Japan','Switzerland',
                                    'Hong Kong, China','United States','Qatar','Iceland','Sweden',
                                    'Singapore','Denmark'])

print (dict_country_gdp)

Exercise 04: Access elements in Series

#access elements in the series
#note: plain integer access on a label-indexed series falls back to position and is
#deprecated in recent pandas; prefer iloc for positional access


dict_country_gdp[0]
#access first 5 countries from the series
dict_country_gdp[0:5]
#look up a country by name or index
dict_country_gdp.loc['United States']
#look up by position
dict_country_gdp.iloc[0]

Exercise 05 : Create Series from scalar

#Print Series with scalar input


scalar_series = pd.Series(5.,index=['a','b','c','d','e'])
scalar_series

Exercise 06 : Vectorized Operations


#declare two different vector series with same indexes
first_vector_series = pd.Series([1,2,3,4],index=['a','b','c','d'])
second_vector_series = pd.Series([10,20,30,40],index=['a','b','c','d'])

first_vector_series+second_vector_series
#now shuffle the index of the second vector series
second_vector_series = pd.Series([10,20,30,40],index=['a','d','b','c'])

#addition aligns on index labels, not positions, so values pair up by label
first_vector_series+second_vector_series
#now replace a few indexes with new ones in the second vector series
second_vector_series = pd.Series([10,20,30,40],index=['a','b','e','f'])
#labels present in only one series produce NaN in the result
first_vector_series+second_vector_series
Assignment 01 FAA
Analyse the Federal Aviation Authority Dataset using Pandas
DESCRIPTION

Problem:
Analyze the Federal Aviation Authority (FAA) dataset using Pandas to do the following:
1. View
   - aircraft make name
   - state name
   - aircraft model name
   - text information
   - flight phase
   - event description type
   - fatal flag
2. Clean the dataset and replace the fatal flag NaN with "No"
3. Find the aircraft types and their occurrences in the dataset
4. Remove all the observations where aircraft names are not available
5. Display the observations where the fatal flag is "Yes"

#import necessary library


import pandas as pd

#read the faa (federal aviation authority) dataset


df_faa_dataset = pd.read_csv('C:\\dataset\\faa_ai_prelim.csv')

#view the dataset shape


df_faa_dataset.shape

#view the first five observations


df_faa_dataset.head()

#view all the columns present in the dataset


df_faa_dataset.columns

#now create a new data frame with only required columns


#copy() gives an independent frame so the subset can be modified without warnings
df_analyze_dataset = df_faa_dataset[['ACFT_MAKE_NAME','LOC_STATE_NAME','ACFT_MODEL_NAME',
                                     'RMK_TEXT','FLT_PHASE','EVENT_TYPE_DESC','FATAL_FLAG']].copy()

#view the type of the object


type(df_analyze_dataset)

#view first five observations


df_analyze_dataset.head()

#replace all NaN in the fatal flag column with 'No'


df_analyze_dataset['FATAL_FLAG'] = df_analyze_dataset['FATAL_FLAG'].fillna('No')

#now view first five observations


df_analyze_dataset.head()

#view the shape of the dataset


df_analyze_dataset.shape

#drop values where ACFT_MAKE_NAME (aircraft make name) is not available


df_final_dataset = df_analyze_dataset.dropna(subset=['ACFT_MAKE_NAME'])

#now view the new shape of the dataset


df_final_dataset.shape

#group by aircraft name


aircraftType = df_final_dataset.groupby('ACFT_MAKE_NAME')

#view the number of observations per aircraft make using the size method


aircraftType.size()
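
The same counts are available in one step with value_counts, sorted by frequency rather than by name:

#one-step alternative to groupby + size, sorted by count
df_final_dataset['ACFT_MAKE_NAME'].value_counts()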

#Now group the dataset by fatal flag


fatalAccidents = df_final_dataset.groupby('FATAL_FLAG')

#view the fatal accidents size


fatalAccidents.size()

#select the accidents with fatalities (fatal flag 'Yes')


accidents_with_fatality = fatalAccidents.get_group('Yes')

#view the accidents with fatality


accidents_with_fatality

#FDNY Assignment 02:


Analyse the New York City Fire Department Dataset
DESCRIPTION

What to do:
A dataset in CSV format is given for the Fire Department of New York City. Analyze the dataset to determine:
1. The total number of fire department facilities in New York City
2. The number of fire department facilities in each borough
3. The facility names in Manhattan
#import libraries
import pandas as pd

#read data from csv file fire department of New York City (FDNY)
df_fdny_csv_data_raw = pd.read_csv(r'C:\dataset\FDNY_Firehouse_Listing.csv')

#view content of the data


df_fdny_csv_data_raw

#view first five records


df_fdny_csv_data_raw.head(5)

#re-read the dataset, skipping the first row


df_fdny_csv_data = pd.read_csv(r'C:\dataset\FDNY_Firehouse_Listing.csv',skiprows=1)

#view first five records from fixed dataset


df_fdny_csv_data.head(5)

#view data statistics using describe()


df_fdny_csv_data.describe()

#view columns of the dataset


df_fdny_csv_data.columns

#view the index of the dataset


df_fdny_csv_data.index

#Count number of records


df_fdny_csv_data.count()

#view datatypes
df_fdny_csv_data.dtypes

#group FDNY information by borough


groupby_borough = df_fdny_csv_data.groupby('Borough')

#view the number of FDNY facilities in each borough


groupby_borough.size()

#select FDNY information for Manhattan


fdny_info_Manhattan = groupby_borough.get_group('Manhattan')

#View FDNY information for Manhattan


fdny_info_Manhattan
