-
Notifications
You must be signed in to change notification settings - Fork 0
/
Classifier.py
170 lines (136 loc) · 6.28 KB
/
Classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import argparse
import os
import sys
import pandas as pd
from nltk import word_tokenize
from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from embeddings.embedding_vectorizer import EmbeddingVectorizer
from loggers.ClassifierLogger import ClassifierLogger
from FeatureManager import FeatureManager
from itertools import chain, combinations
# Default run configuration. When the file is executed as a script, the
# command line options parsed in the __main__ section overwrite these values.

# Feature names whose combinations are handed to FeatureManager. 'd2v' only
# produces embeddings when EMBEDDINGS_CORPUS is also set.
FEATURES = ['d2v', 'link', 'authors', 'headline', 'short_description']
# Forwarded to FeatureManager as its cleansing switch.
CLEANSE = True
# Feature combinations with fewer elements than this are skipped; by default
# only the complete default feature list is large enough.
MIN_COMBINATION_SIZE = len(FEATURES)
# Name of the sub-directory of 'embeddings' that holds 'doc2vec.bin';
# None disables the d2v embeddings.
EMBEDDINGS_CORPUS = None
# Names of the algorithms to train; each name is resolved by
# Classifier.get_classifier.
ALGORITHMS = ["Support Vector Machine"]
# Fraction of the data held out as test set by Classifier.train.
TEST_SIZE = 0.2
# Example of a longer ALGORITHMS list:
# ["Naive Bayes", "Decision Tree", "Adaboost", "Support Vector Machine", "Random Forest", "Gradient Descent"]
class Classifier:
    """Train and evaluate a list of classification algorithms on one feature set.

    The feature matrix comes from ``feature_manager.features`` and the labels
    from the ``'category'`` column of ``feature_manager.data``.
    """

    def __init__(self, feature_manager, algo_list, logger):
        """Store the collaborators used by :meth:`train` and :meth:`evaluate`.

        Args:
            feature_manager: object exposing ``data`` (indexable by
                ``'category'``) and ``features`` (the matrix to train on).
            algo_list: iterable of algorithm names understood by
                :meth:`get_classifier`.
            logger: object with an ``info`` method used for progress output.
        """
        self.logger = logger
        self.data = feature_manager.data
        self.feature_manager = feature_manager
        self.algo_list = algo_list

    @staticmethod
    def get_classifier(algo):
        """Return a new, unfitted estimator for the algorithm name ``algo``.

        Args:
            algo: one of "Gradient Boost", "Random Forest", "Adaboost",
                "Decision Tree", "Naive Bayes", "Gradient Descent",
                "Support Vector Machine" or "MLPC".

        Returns:
            A freshly constructed estimator instance.

        Raises:
            ValueError: if ``algo`` is not one of the names listed above.
        """
        if algo == "Gradient Boost":
            return GradientBoostingClassifier()
        if algo == "Random Forest":
            return RandomForestClassifier()
        if algo == "Adaboost":
            return AdaBoostClassifier()
        if algo == "Decision Tree":
            return DecisionTreeClassifier()
        if algo == "Naive Bayes":
            return BernoulliNB()
        if algo == "Gradient Descent":
            return SGDClassifier()
        if algo == "Support Vector Machine":
            return LinearSVC()
        if algo == "MLPC":
            # Neural network (multi-layer perceptron)
            return MLPClassifier(activation='logistic', batch_size='auto',
                                 early_stopping=True, hidden_layer_sizes=(100,),
                                 learning_rate='adaptive', learning_rate_init=0.1,
                                 max_iter=5000, random_state=1, solver='lbfgs',
                                 tol=0.0001, validation_fraction=0.1, verbose=True,
                                 warm_start=False)
        # Fail here with a clear message instead of returning a placeholder
        # that only breaks later, when train() calls .fit() on it.
        raise ValueError("Unknown algorithm: {!r}".format(algo))

    def train(self):
        """Split the data once, then fit and evaluate every algorithm in ``algo_list``.

        The split uses the module-level ``TEST_SIZE`` and a fixed
        ``random_state`` so that all algorithms see the same partition. The
        split and the most recently created estimator are kept on the
        instance (``X_train``, ``X_test``, ``y_train``, ``y_test``, ``clf``).

        Raises:
            ValueError: if ``algo_list`` contains a name that
                :meth:`get_classifier` does not know.
        """
        X = self.feature_manager.features
        y_1 = self.data['category'].values.tolist()

        # Split dataset into training set and test set
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y_1, test_size=TEST_SIZE, random_state=2)

        for algo in self.algo_list:
            # Instantiate the classifying algorithm
            self.clf = self.get_classifier(algo)
            self.logger.info(algo)
            self.logger.info("====")
            self.logger.info("Training...")
            clf = self.clf.fit(self.X_train, self.y_train)
            self.logger.info("Trained!")
            self.logger.info("Evaluating...")
            self.evaluate(clf)
            self.logger.info("Evaluated!")

    def evaluate(self, clf):
        """Predict the held-out test set with ``clf`` and log predictions and accuracy.

        Must be called after the train/test split has been stored on the
        instance, because it reads ``self.X_test`` and ``self.y_test``.

        Args:
            clf: fitted estimator exposing ``predict``.
        """
        y_pred = clf.predict(self.X_test)
        self.logger.info("-> Predicted: {}".format(y_pred))
        self.logger.info("-> Accuracy: {}".format(accuracy_score(self.y_test, y_pred)))
if __name__ == "__main__":
    # Script entry point: parse the options, load the corpus, optionally build
    # doc2vec embeddings, then train and evaluate every selected algorithm on
    # every feature combination that is large enough.

    # Definition of command line arguments for ArgumentParser
    parser = argparse.ArgumentParser(description='Runs Classifier')
    # --cleanse and --no_cleanse share one destination. Cleansing is on by
    # default (CLEANSE), so --no_cleanse is the only way to switch it off;
    # --cleanse is kept for backward compatibility.
    parser.add_argument('--cleanse', action='store_true', dest='cleanse', default=CLEANSE)
    parser.add_argument('--no_cleanse', action='store_false', dest='cleanse', default=CLEANSE)
    parser.add_argument('--features', action='store', nargs='+', dest='features', default=FEATURES)
    parser.add_argument('--algo', action='store', nargs='+', dest='algo', default=ALGORITHMS)
    # No static default: when omitted, the minimum size is derived below from
    # the features that were actually selected.
    parser.add_argument('--min_feat_size', action='store', dest='min_comb_size', type=int, default=None)
    parser.add_argument('--embeddings', action='store', dest='embeddings', default=None)
    parser.add_argument('--test_size', action='store', type=float, dest='test_size', default=TEST_SIZE)

    # Parsing of arguments from command line
    args = parser.parse_args(sys.argv[1:])

    # Configuration of parameters to be overwritten. These are module-level
    # assignments on purpose: Classifier.train reads the global TEST_SIZE.
    CLEANSE = args.cleanse
    FEATURES = args.features
    ALGORITHMS = args.algo
    if args.min_comb_size is not None:
        # "is not None" so that an explicit 0 is honoured
        MIN_COMBINATION_SIZE = args.min_comb_size
    else:
        # Default to the full selected feature set; using the length of the
        # built-in default list would skip every combination whenever fewer
        # features are passed with --features.
        MIN_COMBINATION_SIZE = len(FEATURES)
    if args.embeddings:
        EMBEDDINGS_CORPUS = args.embeddings
    if args.test_size:
        TEST_SIZE = args.test_size

    # Instantiate Logger class
    logger = ClassifierLogger().get_logger()

    # Get the corpus file
    dir_path = os.path.dirname(os.path.realpath(__file__))
    corpus = os.path.join(dir_path, "resources", "News_category_train.json")
    data = pd.read_json(corpus)

    # Build every combination of the selected features (all sizes) to see
    # which ones are relevant. The result gets its own name so that the
    # imported itertools.combinations is not shadowed.
    feature_combinations = chain.from_iterable(
        combinations(FEATURES, size) for size in range(len(FEATURES) + 1)
    )

    embeddings_active = 'd2v' in FEATURES and EMBEDDINGS_CORPUS is not None
    embeddings = None
    if embeddings_active:
        # Get the embeddings
        logger.info("Loading embeddings...")
        model = EmbeddingVectorizer(os.path.join(dir_path, 'embeddings', EMBEDDINGS_CORPUS, 'doc2vec.bin'))
        logger.info("Creating d2v feature sparse matrix")
        rows = []
        for idx, element in enumerate(data['headline']):
            # One inferred vector per tokenized headline
            vec = model.infer_vector(word_tokenize(element), 0.01, 1000)
            rows.append(vec)
            if idx % 1000 == 0:
                logger.info("Processed {} rows".format(idx))
        logger.info("Finished. Rows: {}".format(len(rows)))
        embeddings = sparse.csr_matrix(rows)

    # Train and evaluate all combinations
    logger.info("Cleasing active? {}".format(CLEANSE))
    logger.info("Embeddings active? {}".format(embeddings_active))
    for combination in feature_combinations:
        if len(combination) < MIN_COMBINATION_SIZE:
            continue
        feat_manager = FeatureManager(data, combination, logger, CLEANSE, embeddings)
        # No usable feature matrix for this combination: nothing to train on
        if feat_manager.features is None:
            continue
        classifier = Classifier(feat_manager, ALGORITHMS, logger)
        classifier.train()