Final DAA
Final DAA
& ENGG
( Bhilai Institute of Technology, Durg )
CERTIFICATE OF COMPLETION
Approved By:
# File-handling demo: open a file, report any access error, and always
# attempt to close the handle afterwards.
# NOTE(review): the statements inside the first `try` were lost in the PDF
# extraction; a minimal open/read is reconstructed here so the except/finally
# handlers below are reachable — confirm against the original notebook.
try:
    file = open("(unknown)")  # TODO confirm the actual file path from the lab sheet
    print(file.read())
except FileNotFoundError:
    print(f"The file (unknown) was not found.")
except IOError:
    print(f"Error occurred while accessing the file (unknown).")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
finally:
    # Closing the file explicitly
    try:
        file.close()
        print("File closed successfully.")
    except NameError:
        # File was never opened, no need to close
        pass
    except Exception as e:
        print(f"An error occurred while closing the file: {e}")
[ ]: print(arr)
[1 2 3 4 5]
[ ]: array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
[ ]: arr_random = np.random.rand(3, 3)
arr_random
[ ]: arr1d + 10
[ ]: arr1d * 2
2
[ ]: array([ 2, 4, 6, 8, 10])
[ ]: np.sqrt(arr1d)
[ ]: np.exp(arr1d)
[ ]: arr2d.ndim, arr2d.size
[ ]: (2, 6)
[ ]: a = np.random.rand(3, 3)
[ ]: a,b
[ ]: (array([[5, 3],
[5, 2]]),
array([[3, 3],
[2, 7]]))
[ ]: array([[3, 5],
[2, 5]])
[ ]: a[1][1] #Indexing
[ ]: 2
[ ]: # Splitting
split_arr = np.split(a, [1]) # Split at indices 1
3
print("Split array:",split_arr)
[ ]: x = np.stack((a, b))
x
[ ]: array([[[5, 3],
[5, 2]],
[[3, 3],
[2, 7]]])
# Element-wise and matrix arithmetic on the 2x2 arrays `a` and `b`.
print("addition\n", a + b)
# BUG FIX: the original printed `a + b` under both the "subbtraction" (sic)
# and "division" labels; compute the operations the labels promise.
print("subtraction\n", a - b)
print("division\n", a / b)
print("multiply 1\n", a * b)  # element-wise (Hadamard) product
print("multiply 2\n", a @ b)  # matrix product
addition
[[8 6]
[7 9]]
subbtraction
[[8 6]
[7 9]]
division
[[8 6]
[7 9]]
multiply 1
[[15 9]
[10 14]]
multiply 2
[[21 36]
[19 29]]
[ ]: c = np.random.rand(4, 4)
[ ]: c
4
[ ]: array([[0.38479275, 0.84841358],
[0.51642219, 0.71366856]])
[ ]: c[0:-2, 0:-2]
[ ]: array([[0.38479275, 0.84841358],
[0.51642219, 0.71366856]])
# Print a battery of summary statistics for the array `a`.
# Table-driven form: one (label, reducer) pair per statistic, printed in order.
for label, reducer in (
    ("mean", np.mean),
    ("sum", np.sum),
    ("min", np.min),
    ("max 1", np.max),
    ("cumsum 2", np.cumsum),
):
    print(label + "\n", reducer(a))
mean
3.75
sum
15
min
2
max 1
5
cumsum 2
[ 5 8 13 15]
[ ]: a
[ ]: array([[5, 3],
[5, 2]])
[ ]: a
[ ]: array([[5, 3],
[5, 2]])
[ ]: a.T
[ ]: array([[5, 5],
[3, 2]])
[ ]: array([[5],
[3],
[5],
[2]])
5
[ ]: a.max(axis = 0)
[ ]: array([5, 3])
[ ]: x.ndim, x.size
[ ]: (3, 8)
#LAB -3#
# LAB-3: exploring the gapminder dataset with pandas.
import pandas as pd

data = pd.read_csv("/content/gapminder-FiveYearData.csv")

# Sorting: five rows with the lowest GDP per capita
data.sort_values(by=["gdpPercap"]).head(5)

# Filtering: keep only the 2007 observations
data_2007 = data[data["year"] == 2007]
data_2007.head(5)

# Country with the maximum GDP per capita.
# Series.max() instead of the builtin max() over a pandas Series.
max_gdp = data["gdpPercap"].max()
country = data[data["gdpPercap"] == max_gdp]
country  # country with max gdp Per Capita

# Mean life expectancy per year.
# BUG FIX: the original iterated over data["year"] row by row (1704 iterations,
# recomputing the mean for every duplicate year); iterate the distinct years.
year_wise_lifeExp_dict = {}
for year in data["year"].unique():
    year_wise_lifeExp_dict[year] = data[data["year"] == year].lifeExp.mean()
year_wise_lifeExp = pd.Series(year_wise_lifeExp_dict)
[ ]: year_wise_lifeExp
[ ]: 1952 49.057620
1957 51.507401
1962 53.609249
1967 55.678290
1972 57.647386
1977 59.570157
1982 61.533197
1987 63.212613
1992 64.160338
1997 65.014676
2002 65.694923
2007 67.007423
dtype: float64
[ ]: year_wise_lifeExp_sr = data.groupby("year")["lifeExp"].mean()
year_wise_lifeExp_sr
[ ]: year
1952 49.057620
1957 51.507401
1962 53.609249
1967 55.678290
1972 57.647386
1977 59.570157
1982 61.533197
1987 63.212613
1992 64.160338
1997 65.014676
2002 65.694923
2007 67.007423
Name: lifeExp, dtype: float64
#LAB-4#
# LAB-4: working with multiple figures and axes in matplotlib.
import matplotlib.pyplot as plt
import numpy as np

x = [10, 20, 25, 15]
y = [5, 13, 6, 7]
data = np.random.randn(1000)

# Working with Multiple Figures and Axes
# Subplots: a 2x2 grid of axes on one figure
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
# NOTE(review): the plotting calls between subplots() and legend() were lost at
# a page break in extraction; a line plot and a histogram are reconstructed so
# the legend and 'Histogram' title refer to something — confirm against the
# original notebook.
axs[0, 0].plot(x, y, label="x vs y")
axs[0, 0].legend()  # Adding Legend
axs[0, 1].hist(data)
axs[0, 1].set_title('Histogram')
plt.tight_layout()
# Saving the chart
plt.savefig('figure chart.png')
plt.show()
8
#LAB-5#
[ ]: import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
9
sns.regplot(data=tips, x="total_bill", y="tip_percentage",ax = axes[2])
plt.title('Regression Plot of Total Bill vs. Tip Percentage')
#LAB-6#
[ ]: import numpy as np
import pandas as pd
[ ]: data.head()
10
[ ]: CREATED_DATE CREATED_DATE minus Hour \
0 2016-01-09 00:18:14 2016-01-09
1 2016-01-09 02:28:34 2016-01-09
2 2016-01-09 04:00:34 2016-01-09
3 2016-01-09 10:26:27 2016-01-09
4 2016-01-09 11:37:59 2016-01-09
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[ ]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CREATED_DATE 10000 non-null datetime64[ns]
1 CREATED_DATE minus Hour 10000 non-null datetime64[ns]
2 USER_ID 10000 non-null object
3 TRANSACTION_ID 10000 non-null object
4 TYPE 10000 non-null object
5 CURRENCY 10000 non-null object
6 AMOUNT 10000 non-null float64
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 547.0+ KB
[ ]: data.describe()
11
max 2017-01-08 23:50:18 2017-01-08 00:00:00 349.980000
std NaN NaN 101.406464
# Derive calendar features from the transaction timestamp.
data["year"] = pd.DatetimeIndex(data.CREATED_DATE).year
data["month"] = pd.DatetimeIndex(data.CREATED_DATE).month
data["weekdays"] = pd.DatetimeIndex(data.CREATED_DATE).weekday

# Normalise every amount to EUR; non-EUR rows are converted at a flat 1.17
# rate (presumably GBP->EUR — confirm the rate against the lab sheet).
# PERF FIX: the original built a Python list with per-row .iloc lookups
# (O(n) interpreter-level iteration); Series.where performs the identical
# conditional conversion vectorised.
data["AMT_EUR"] = data["AMOUNT"].where(
    data["CURRENCY"] == "EUR", data["AMOUNT"] * 1.17
)
[ ]: data.head()
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[ ]: data[["TYPE"]].value_counts()
[ ]: TYPE
TOPUP 2373
BANK_TRANSFER 2371
ATM 2357
12
CARD_PAYMENT 2325
P2P_TRANSFER 574
Name: count, dtype: int64
#LAB-7#
[ ]: import numpy as np
import pandas as pd
data = pd.read_csv("/content/Titanic.csv")
[ ]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 pclass 1309 non-null int64
1 survived 1309 non-null int64
2 name 1309 non-null object
3 sex 1309 non-null object
4 age 1046 non-null float64
5 sibsp 1309 non-null int64
6 parch 1309 non-null int64
7 ticket 1309 non-null object
8 fare 1308 non-null float64
9 cabin 295 non-null object
10 embarked 1307 non-null object
11 boat 486 non-null object
12 body 121 non-null float64
13 home.dest 745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB
[ ]: data.head()
[ ]: data.describe()
13
[ ]: pclass survived sex age sibsp \
count 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000
mean 2.294882 0.381971 0.644003 29.881138 0.498854
std 0.837836 0.486055 0.478997 12.883193 1.041658
min 1.000000 0.000000 0.000000 0.170000 0.000000
25% 2.000000 0.000000 0.000000 22.000000 0.000000
50% 3.000000 0.000000 1.000000 29.881138 0.000000
75% 3.000000 1.000000 1.000000 35.000000 1.000000
max 3.000000 1.000000 1.000000 80.000000 8.000000
[ ]: data.head()
[ ]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 pclass 1309 non-null int64
1 survived 1309 non-null int64
14
2 sex 1309 non-null int64
3 age 1309 non-null float64
4 sibsp 1309 non-null int64
5 parch 1309 non-null int64
6 fare 1309 non-null float64
7 embarked 1309 non-null int64
8 body 121 non-null float64
dtypes: float64(3), int64(6)
memory usage: 92.2 KB
# Encode the categorical columns as integers so they can enter data.corr():
# sex (male=1, female=0) and port of embarkation (S=2, C=1, Q=0).
data['sex'] = data['sex'].replace({'male': 1, 'female': 0})
data['embarked'] = data['embarked'].replace({'S': 2, 'C': 1, 'Q': 0})
[ ]: data.corr()
[ ]:
[ ]:
[ ]:
[ ]:
#LAB-8#
15
[ ]: import numpy as np
import pandas as pd
[ ]: data.head()
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[ ]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CREATED_DATE 10000 non-null datetime64[ns]
1 CREATED_DATE minus Hour 10000 non-null datetime64[ns]
2 USER_ID 10000 non-null object
3 TRANSACTION_ID 10000 non-null object
4 TYPE 10000 non-null object
5 CURRENCY 10000 non-null object
6 AMOUNT 10000 non-null float64
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 547.0+ KB
16
[ ]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns
gdp_missing_values_data = pd.read_csv('./Datasets/GDP_missing_data.csv')
gdp_complete_data = pd.read_csv('./Datasets/GDP_complete_data.csv')
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-1-a1d39de8ca53> in <cell line: 6>()
4 import sklearn as sk
5 import seaborn as sns
----> 6 gdp_missing_values_data = pd.read_csv('./Datasets/GDP_missing_data.csv')
7 gdp_complete_data = pd.read_csv('./Datasets/GDP_complete_data.csv')
/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in␣
↪read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,␣
↪usecols, dtype, engine, converters, true_values, false_values,␣
↪skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na,␣
↪na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format,␣
↪keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator,␣
↪chunksize, compression, thousands, decimal, lineterminator, quotechar,␣
↪quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect,␣
↪on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision,␣
↪storage_options, dtype_backend)
910 kwds.update(kwds_defaults)
911
--> 912 return _read(filepath_or_buffer, kwds)
913
914
/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in␣
↪_read(filepath_or_buffer, kwds)
575
576 # Create the parser.
--> 577 parser = TextFileReader(filepath_or_buffer, **kwds)
578
579 if chunksize or iterator:
/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in␣
↪__init__(self, f, engine, **kwds)
1405
1406 self.handles: IOHandles | None = None
-> 1407 self._engine = self._make_engine(f, self.engine)
1408
1409 def close(self) -> None:
/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py in␣
↪_make_engine(self, f, engine)
17
1659 if "b" not in mode:
1660 mode += "b"
-> 1661 self.handles = get_handle(
1662 f,
1663 mode,
/usr/local/lib/python3.10/dist-packages/pandas/io/common.py in␣
↪get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text,␣
↪errors, storage_options)
[ ]: data.describe()
[ ]: data["year"] = pd.DatetimeIndex(data.CREATED_DATE).year
data["month"] = pd.DatetimeIndex(data.CREATED_DATE).month
data["weekdays"] = pd.DatetimeIndex(data.CREATED_DATE).weekday
# Normalise every amount to EUR; non-EUR rows are converted at a flat 1.17
# rate (presumably GBP->EUR — confirm the rate against the lab sheet).
# PERF FIX: replaces the original per-row .iloc loop with a vectorised
# conditional — identical result, no Python-level iteration.
EUR = data["AMOUNT"].where(data["CURRENCY"] == "EUR", data["AMOUNT"] * 1.17)
data["AMT_EUR"] = EUR
[ ]: data.head()
18
[ ]: CREATED_DATE CREATED_DATE minus Hour \
0 2016-01-09 00:18:14 2016-01-09
1 2016-01-09 02:28:34 2016-01-09
2 2016-01-09 04:00:34 2016-01-09
3 2016-01-09 10:26:27 2016-01-09
4 2016-01-09 11:37:59 2016-01-09
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[ ]: data[["TYPE"]].value_counts()
[ ]: TYPE
TOPUP 2373
BANK_TRANSFER 2371
ATM 2357
CARD_PAYMENT 2325
P2P_TRANSFER 574
Name: count, dtype: int64
[ ]: data["year"].unique()
[ ]: data.groupby(["CURRENCY"])["AMOUNT"].sum()
[ ]: CURRENCY
EUR 852363.35
GBP 905319.18
Name: AMOUNT, dtype: float64
19
2 EUR 22249.70
GBP 26937.35
3 EUR 44099.57
GBP 45814.22
4 EUR 43964.14
GBP 45241.07
5 EUR 49489.32
GBP 51630.61
6 EUR 53965.12
GBP 58219.62
7 EUR 81995.70
GBP 82271.76
8 EUR 100820.63
GBP 114643.94
9 EUR 90419.37
GBP 95699.41
10 EUR 101629.15
GBP 115582.59
11 EUR 105934.72
GBP 105177.93
12 EUR 110733.82
GBP 112710.05
2017 1 EUR 27446.69
GBP 31235.29
Name: AMOUNT, dtype: float64
[ ]: data.groupby(["weekdays", "CURRENCY"])["AMOUNT"].sum()
[ ]: weekdays CURRENCY
0 EUR 107370.90
GBP 129305.04
1 EUR 125032.02
GBP 118797.33
2 EUR 121888.83
GBP 129554.67
3 EUR 119865.46
GBP 131812.35
4 EUR 138228.10
GBP 150998.18
5 EUR 132238.72
GBP 135012.44
6 EUR 107739.32
GBP 109839.17
Name: AMOUNT, dtype: float64
[ ]: data.groupby(["TYPE", "CURRENCY"])["AMOUNT"].sum()
20
[ ]: TYPE CURRENCY
ATM EUR 213140.45
GBP 198558.25
BANK_TRANSFER EUR 205127.11
GBP 213737.72
CARD_PAYMENT EUR 210115.77
GBP 204736.58
P2P_TRANSFER EUR 19905.82
GBP 82075.52
TOPUP EUR 204074.20
GBP 206211.11
Name: AMOUNT, dtype: float64
[ ]: data.groupby(["weekdays"])["AMT_EUR"].sum().plot()
[ ]: <Axes: xlabel='weekdays'>
[ ]: data.groupby(["USER_ID"])["TRANSACTION_ID"].count().sort_values(ascending =␣
↪False)
21
[ ]: USER_ID
06bb2d68-bf61-4030-8447-9de64d3ce490 132
d35f19f3-d9ad-48bf-bd1e-90f3ba4f0b98 103
d1bc3cd6-154e-479f-8957-a69cdf414462 95
0fe472c9-cf3e-4e43-90f3-a0cfb6a4f1f0 85
65ac0928-e17d-4636-96f4-ebe6bdb9c98d 84
…
dcf8d6c6-9fb6-4b0b-a190-013d220b33d7 1
2d6259b3-5a22-4b4b-b616-c22d9d7677c2 1
2d518cf9-d853-443d-a3d8-bda56f373901 1
5a99fa7a-72e5-4dbe-ae51-f0fd3bc8a717 1
2588d6c8-1a2e-4a54-a191-3b3111f9658e 1
Name: TRANSACTION_ID, Length: 1134, dtype: int64
#LAB-9#
[ ]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
[ ]: data.head()
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[ ]: data.info()
22
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CREATED_DATE 10000 non-null datetime64[ns]
1 CREATED_DATE minus Hour 10000 non-null datetime64[ns]
2 USER_ID 10000 non-null object
3 TRANSACTION_ID 10000 non-null object
4 TYPE 10000 non-null object
5 CURRENCY 10000 non-null object
6 AMOUNT 10000 non-null float64
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 547.0+ KB
[ ]: data.describe()
[ ]: data[["TYPE"]].value_counts()
[ ]: TYPE
TOPUP 2373
BANK_TRANSFER 2371
ATM 2357
CARD_PAYMENT 2325
P2P_TRANSFER 574
Name: count, dtype: int64
[ ]: data[["AMOUNT"]].mean()
[ ]: AMOUNT 175.768253
dtype: float64
[ ]: data[["AMOUNT"]].median()
[ ]: AMOUNT 177.455
dtype: float64
23
[ ]: data[["AMOUNT"]].mode()
[ ]: AMOUNT
0 124.01
[ ]: data[["AMOUNT"]].std()
[ ]: AMOUNT 101.406464
dtype: float64
[ ]: data[["AMOUNT"]].gt(200).mean()
[ ]: AMOUNT 0.4322
dtype: float64
[ ]: data["AMOUNT"].unique()
[ ]: data['AMOUNT'].value_counts()
[ ]: AMOUNT
124.01 6
140.59 4
284.25 4
53.96 3
13.63 3
..
292.14 1
52.09 1
110.65 1
307.05 1
228.90 1
Name: count, Length: 8746, dtype: int64
# Probability distribution of the full AMOUNT column.
amounts = data['AMOUNT']
plt.hist(amounts, bins=10)
plt.xlabel('AMOUNT')
plt.ylabel('Probability')
plt.title('Probability distribution of AMOUNT')
plt.show()

# Create a sampling distribution
# Draw 100 amounts with replacement and plot their histogram for comparison.
sample = amounts.sample(n=100, replace=True)
plt.hist(sample, bins=10)
plt.xlabel('AMOUNT')
plt.ylabel('Probability')
plt.title('Sampling distribution of AMOUNT')
plt.show()
25
#LAB 10#
[1]: import numpy as np
import pandas as pd
[2]: data.head()
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
26
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[3]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CREATED_DATE 10000 non-null datetime64[ns]
1 CREATED_DATE minus Hour 10000 non-null datetime64[ns]
2 USER_ID 10000 non-null object
3 TRANSACTION_ID 10000 non-null object
4 TYPE 10000 non-null object
5 CURRENCY 10000 non-null object
6 AMOUNT 10000 non-null float64
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 547.0+ KB
[4]: data.describe()
# Normalise every amount to EUR; non-EUR rows are converted at a flat 1.17
# rate (presumably GBP->EUR — confirm the rate against the lab sheet).
# PERF FIX: vectorised conditional instead of the original per-row .iloc loop.
EUR = data["AMOUNT"].where(data["CURRENCY"] == "EUR", data["AMOUNT"] * 1.17)
data["AMT_EUR"] = EUR
[7]: data.head()
USER_ID TRANSACTION_ID \
0 45e3c222-38ac-4fdb-b092-ff1639e4438c 27d7fd11-d885-4d2c-9ed1-daa89b7bda1d
1 57c11728-b979-4856-bada-1d268726cfe9 2e1ee26c-0d24-4931-a7f9-0caa0d07eb2e
2 1319cca9-02a7-4a15-8abb-48d4e08e5aa3 bfd20e6f-ddb3-4237-bcd2-f7f8d967e36e
3 3f6bb28c-f945-4027-9178-747956c3ea58 85037186-039a-4ae5-9fea-e87f30822218
4 f54baeeb-7282-4d23-9bb7-e8396ce1b159 8e1e938a-1916-4d5e-b261-82c61a6979d6
[8]: data[["TYPE"]].value_counts()
[8]: TYPE
TOPUP 2373
BANK_TRANSFER 2371
ATM 2357
CARD_PAYMENT 2325
P2P_TRANSFER 574
Name: count, dtype: int64
Hypothesis: the top 3% of users drive 20% of the transaction value — the same share as the bottom 60% of users — for both EUR and GBP.
# Per-user total spend in EUR, ranked in both directions.
# PERF FIX: the original ran the identical groupby-sum aggregation twice;
# compute it once and sort the cached result each way.
user_totals = data.groupby(["USER_ID"])["AMT_EUR"].sum()
top_users = user_totals.sort_values(ascending=False)
bottom_users = user_totals.sort_values()
28
[10]: top_users_count = len(top_users)
29