Python Comcast Telecom Consumer Complaints Final Project
Python Comcast Telecom Consumer Complaints Final Project
import pandas as pd
import matplotlib.pyplot as plt
In [2]: df = pd.read_csv('C:\\Users\\sathi\\OneDrive\\Desktop\\SIMPLILEARN\\PYTHON\\PROJECTS
In [3]: df.head(3)
Out[3]:
   Ticket #  Customer Complaint                            Date      Date_month_year  Time         Received Via        City      State     Zip code  St…
0  250635    Comcast Cable Internet Speeds                 22-04-15  22-Apr-15        3:53:50 PM   Customer Care Call  Abingdon  Maryland  21009     Cl…
1  223441    Payment disappear - service got disconnected  04-08-15  04-Aug-15        10:22:56 AM  Internet            Acworth   Georgia   30102     Cl…
2  242732    Speed and Service                             18-04-15  18-Apr-15        9:55:47 AM   Internet            Acworth   Georgia   30101     Cl…
In [6]: df.dtypes
In [7]: df = df.set_index(df["date_index"])
In [8]: df.head(3)
Out[8]:
                     Ticket #  Customer Complaint                            Date      Date_month_year  Time         Received Via        City      State
date_index
2015-04-22 15:53:50  250635    Comcast Cable Internet Speeds                 22-04-15  2015-04-22       3:53:50 PM   Customer Care Call  Abingdon  Maryland
2015-08-04 10:22:56  223441    Payment disappear - service got disconnected  04-08-15  2015-08-04       10:22:56 AM  Internet            Acworth   Georgia
2015-04-18 09:55:47  242732    Speed and Service                             18-04-15  2015-04-18       9:55:47 AM   Internet            Acworth   Georgia
In [9]: df["Date_month_year"].value_counts()[:3]
In [10]: df["Date_month_year"].value_counts().plot();
In [11]: f = df.groupby(pd.Grouper(freq="M")).size()
In [12]: f.head()
Out[12]: date_index
2015-01-31 55
2015-02-28 59
2015-03-31 45
2015-04-30 375
2015-05-31 317
Freq: M, dtype: int64
In [13]: df.groupby(pd.Grouper(freq="M")).size().plot()
Out[13]: <AxesSubplot:xlabel='date_index'>
In [14]: df.Status.unique()
In [16]: df.head(3)
Out[16]:
                     Ticket #  Customer Complaint                            Date      Date_month_year  Time         Received Via        City      State
date_index
2015-04-22 15:53:50  250635    Comcast Cable Internet Speeds                 22-04-15  2015-04-22       3:53:50 PM   Customer Care Call  Abingdon  Maryland
2015-08-04 10:22:56  223441    Payment disappear - service got disconnected  04-08-15  2015-08-04       10:22:56 AM  Internet            Acworth   Georgia
2015-04-18 09:55:47  242732    Speed and Service                             18-04-15  2015-04-18       9:55:47 AM   Internet            Acworth   Georgia
In [17]: df.groupby(["State"]).size().sort_values(ascending=False).to_frame().reset_index().r
0 Georgia 288
1 Florida 240
2 California 220
3 Illinois 164
4 Tennessee 143
State
State
Out[19]: <AxesSubplot:ylabel='State'>
In [20]: df.groupby(["State"]).size().sort_values(ascending=False).to_frame().reset_index().r
# Print one line per topic: the label followed by the value returned by
# get_simple_topic_percentage for the associated keyword.
# get_simple_topic_percentage is defined outside this excerpt; judging by its
# name it presumably returns the percentage of complaints matching the keyword
# -- TODO confirm.
# NOTE(review): the 'Data cap:' row searches for the keyword 'data' only --
# confirm that this is the intended match.
topic_keywords = [
    ('Comcast:', 'comcast'),
    ('Data cap:', 'data'),
    ('Speed:', 'speed'),
    ('Internet:', 'internet'),
    ('Price:', 'price'),
    ('Bill:', 'bill'),
    ('Customer Service:', 'customer service'),
]
for label, keyword in topic_keywords:
    print(label, get_simple_topic_percentage(keyword))
Comcast: 56.0251798561151
Data cap: 9.847122302158272
Speed: 8.633093525179856
Internet: 23.92086330935252
Price: 2.652877697841727
Bill: 17.04136690647482
Customer Service: 3.507194244604316
# Resources for text preprocessing; the code that consumes them is outside
# this excerpt.
# English stop words as a set (presumably NLTK's stopwords corpus -- the
# import is not visible here).
stop = {word for word in stopwords.words('english')}
# Every character of string.punctuation, as a set.
exclude = {char for char in string.punctuation}
# Lemmatizer instance (presumably NLTK's WordNetLemmatizer -- TODO confirm).
lemma = WordNetLemmatizer()
## Package Plan ##
Note: you may need to restart the kernel to use updated packages.
In [44]: Num_Topic = 9
ldamodel = LdaModel(doc_term_matrix, num_topics= Num_Topic, id2word= dictionary, pas
In [46]: word_dict = {}
# Fill word_dict with one entry per LDA topic, keyed "Topic # <n>", holding
# the first element of each of the 20 items returned by ldamodel.show_topic.
# The loop body is indented here (the exported text had lost its indentation)
# and the comprehension no longer reuses the loop index name.
for topic_id in range(Num_Topic):
    # gensim's show_topic is documented to return (word, probability) pairs,
    # so item[0] keeps only the word -- verify against the installed version.
    top_terms = ldamodel.show_topic(topic_id, topn=20)
    word_dict["Topic # {}".format(topic_id)] = [item[0] for item in top_terms]
In [47]: pd.DataFrame(word_dict)
In [ ]: