Import Libraries
import warnings
warnings.filterwarnings('ignore')
Load Dataset
# Read the pipe-delimited customer location export; the relative path is
# resolved against the current working directory.
df = pd.read_csv("customer_location_df.txt", delimiter='|')
df.shape
(791776, 26)
Select Column
# Keep only rows located in Hong Kong whose PLC_STATUS is '有效', and
# restrict the frame to the two coordinate columns in one selection.
in_hong_kong = df['GOO_COUNTRY_CODE'] == 'Hong Kong'
status_matches = df['PLC_STATUS'] == '有效'
df = df.loc[in_hong_kong & status_matches, ['GOO_LATITUDE', 'GOO_LONGITUDE']]
Outliers
# Show rows whose latitude is below 22 (presumably south of the expected
# Hong Kong range — confirm the threshold).
df.loc[df['GOO_LATITUDE'].lt(22)]
GOO_LATITUDE GOO_LONGITUDE
403267 0.0 0.0
GOO_LATITUDE GOO_LONGITUDE
790967 0.0 0.0
# Show rows whose longitude is above 115 (presumably east of the expected
# Hong Kong range — confirm the threshold).
df.loc[df['GOO_LONGITUDE'].gt(115)]
GOO_LATITUDE GOO_LONGITUDE
519918 28.64789 180.0
# Overwrite the three rows flagged by the outlier queries (two at 0.0/0.0,
# one at 28.64789/180.0) with corrected coordinates.
#
# Each write goes through a single df.loc[rows, column] call. The chained
# form `df.COLUMN.loc[rows] = value` assigns into an intermediate Series and
# is not guaranteed to reach df (it is a no-op under pandas Copy-on-Write).
# The row labels stay wrapped in a list so that a missing label raises
# KeyError instead of silently appending a new row.
df.loc[[403267], 'GOO_LATITUDE'] = 22.4303384
df.loc[[403267], 'GOO_LONGITUDE'] = 114.0891702
df.loc[[790967], 'GOO_LATITUDE'] = 22.3445576
df.loc[[790967], 'GOO_LONGITUDE'] = 114.1882836
df.loc[[519918], 'GOO_LATITUDE'] = 22.3975032
df.loc[[519918], 'GOO_LONGITUDE'] = 113.9760157
# Scatter every remaining location: longitude on x, latitude on y.
plt.scatter(
    x=df['GOO_LONGITUDE'],
    y=df['GOO_LATITUDE'],
    s=7,
    c='green',
)
<matplotlib.collections.PathCollection at 0x7f29bf7dd550>
# Iterate over the data points and calculate the distance using the
# given formula
for x, y in list(zip(X, Y)):
root_diff_x = (x - c_x) ** 2
root_diff_y = (y - c_y) ** 2
distance = np.sqrt(root_diff_x + root_diff_y)
distances.append(distance)
return distances
# Distance from every location to each of the four centroids (c1..c4),
# stored in one column per centroid. calculate_distance is called with the
# centroid first, then the longitude and latitude columns.
df['C1_Distance'] = calculate_distance(c1, df['GOO_LONGITUDE'],
                                       df['GOO_LATITUDE'])
df['C2_Distance'] = calculate_distance(c2, df['GOO_LONGITUDE'],
                                       df['GOO_LATITUDE'])
df['C3_Distance'] = calculate_distance(c3, df['GOO_LONGITUDE'],
                                       df['GOO_LATITUDE'])
df['C4_Distance'] = calculate_distance(c4, df['GOO_LONGITUDE'],
                                       df['GOO_LATITUDE'])

# Pick, per row, the *name* of the column holding the smallest distance.
# idxmin(axis=1) returns that column label directly. Applying np.argmin
# row-wise returns an integer position on current pandas releases, which the
# label-keyed mapping below would turn into NaN for every row.
df['Cluster'] = df[['C1_Distance', 'C2_Distance', 'C3_Distance',
                    'C4_Distance']].idxmin(axis=1)

# Translate the winning column name into a short cluster label.
df['Cluster'] = df['Cluster'].map({'C1_Distance': 'C1', 'C2_Distance': 'C2',
                                   'C3_Distance': 'C3', 'C4_Distance': 'C4'})
# Locations, coloured by the value in the 'Cluster' column.
plt.scatter(
    x=df['GOO_LONGITUDE'],
    y=df['GOO_LATITUDE'],
    s=5,
    c=df['Cluster'],
)
# The four centroids drawn on top as large stars, one fixed colour each.
plt.scatter(x=c1[0], y=c1[1], s=100, c='yellow', marker='*')
plt.scatter(x=c2[0], y=c2[1], s=100, c='red', marker='*')
plt.scatter(x=c3[0], y=c3[1], s=100, c='green', marker='*')
plt.scatter(x=c4[0], y=c4[1], s=100, c='blue', marker='*')
<matplotlib.collections.PathCollection at 0x7f29b5cdb650>