0% found this document useful (0 votes)
12 views · 10 pages

Main Py

The document outlines a process for building a hate speech detection model using a dataset of Marathi text. It includes steps for data cleaning, preprocessing, feature extraction with TF-IDF, and training logistic regression models for binary and multi-class classification. The results are evaluated using classification reports and confusion matrices, and the models are saved for future use.

Uploaded by

niraj21it
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views · 10 pages

Main Py

The document outlines a process for building a hate speech detection model using a dataset of Marathi text. It includes steps for data cleaning, preprocessing, feature extraction with TF-IDF, and training logistic regression models for binary and multi-class classification. The results are evaluated using classification reports and confusion matrices, and the models are saved for future use.

Uploaded by

niraj21it
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 10

import pandas as pd

import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Location of the labelled CSV. Later code reads its 'Text', 'Label' and
# 'Subclass' columns.
file_path = '/content/niraj data set - Sheet1.csv'


# Load the whole dataset into a DataFrame.
df = pd.read_csv(file_path)
# Bare expression: the preview is only rendered by an interactive front end
# (e.g. a notebook cell); when run as a plain script the result is discarded.
df.head()

# Print the per-column summary of the frame.
df.info()
# Per-column count of missing values; the result is neither stored nor printed.
df.isnull().sum()
# Drop every row that has at least one missing value, modifying df in place.
df.dropna(inplace=True)

# Report how many rows carry each value of the binary 'Label' column.
label_counts = df['Label'].value_counts()
print("\nNumber of data rows for each main label:")
for label, count in label_counts.items():
    # Only the value 1 is named "Hate Speech"; any other value is reported
    # as "Not Hate Speech".
    label_name = "Hate Speech" if label == 1 else "Not Hate Speech"
    print(f"{label_name} ({label}): {count}")

# Report how many rows carry each value of the 'Subclass' column.
sublabel_counts = df['Subclass'].value_counts()
print("\nNumber of data rows for each sublabel:")
# Human-readable name for every subclass id (0 is the non-hate class).
sublabel_mapping = {
    1: "Defamatory/Insulting",
    2: "Arrogative",
    3: "Harassing",
    4: "Gender Abusive",
    5: "Racist",
    6: "Religious Intolerant",
    7: "Homophobic",
    8: "Xenophobic",
    9: "Disability Discriminatory",
    10: "Classist",
    0: "Not Hate Speech",
}
for sublabel, count in sublabel_counts.items():
    # Ids missing from the mapping are printed as "Unknown" instead of raising.
    sublabel_name = sublabel_mapping.get(sublabel, "Unknown")
    print(f"{sublabel_name} ({sublabel}): {count}")

def clean_marathi_text(text):
    """Keep only Devanagari characters and whitespace, then tidy the spacing.

    Every character outside the Unicode block U+0900-U+097F that is not
    whitespace is removed (Latin letters, ASCII digits, punctuation, emoji).
    Runs of whitespace are then collapsed to a single space and the result
    is stripped at both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if nothing in the Devanagari block remains.

    Raises:
        TypeError: If ``text`` is not a string (for example a NaN float).
    """
    marathi_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    marathi_only = re.sub(r'\s+', ' ', marathi_only).strip()
    return marathi_only

# Clean every document and keep the result next to the raw text.
df['cleaned_text'] = df['Text'].apply(clean_marathi_text)
# Notebook-style preview; has no effect when run as a plain script.
df[['Text', 'cleaned_text']].head()

# Read the stopword list directly from the text file, one word per line.
# nltk's stopwords.words() expects a corpus file id such as 'english', not a
# filesystem path, so the custom list is loaded without going through NLTK.
# Lines are stripped so stray whitespace cannot stop a word from matching,
# and 'utf-8-sig' tolerates a leading byte-order mark.
with open('/content/marathi_stopwords.txt', encoding='utf-8-sig') as stopword_file:
    marathi_stopwords = {line.strip() for line in stopword_file if line.strip()}

def remove_stopwords(text):
    """Return ``text`` with every token found in ``marathi_stopwords`` removed.

    The text is split on whitespace, so the surviving tokens come back joined
    by single spaces with no leading or trailing whitespace.

    Args:
        text: Whitespace-separated string to filter.

    Returns:
        The filtered string; empty if every token was a stopword.
    """
    return ' '.join(word for word in text.split() if word not in marathi_stopwords)

# Filter stopwords out of the already-cleaned text, overwriting the column.
df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)
# Notebook-style preview; has no effect when run as a plain script.
df[['cleaned_text']].head()

# Feature column and the two target columns taken from the same frame.
X = df['cleaned_text']
y_label = df['Label']
y_subclass = df['Subclass']

# One call splits the text and both targets together, so every test text
# keeps both of its labels. 20% of the rows are held out; random_state=42
# fixes the shuffle so the split is repeatable.
X_train, X_test, y_label_train, y_label_test, y_subclass_train, y_subclass_test = train_test_split(


X, y_label, y_subclass, test_size=0.2, random_state=42
)

# Row counts of the two partitions.
print(f'Training set size: {X_train.shape[0]}')


print(f'Testing set size: {X_test.shape[0]}')

# TF-IDF features with the vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# The vocabulary and IDF weights are learned from the training texts only...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# ...and the test texts are projected onto that same fitted vocabulary.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Shapes are (number of documents, number of vocabulary terms).
print(f'Train TF-IDF shape: {X_train_tfidf.shape}')


print(f'Test TF-IDF shape: {X_test_tfidf.shape}')

# Classifier for the 'Label' target; max_iter is raised to 1000 iterations.
lr_label = LogisticRegression(max_iter=1000)
lr_label.fit(X_train_tfidf, y_label_train)

# Predictions on the held-out split, reused by the evaluation plots below.
y_label_pred = lr_label.predict(X_test_tfidf)

# Per-class precision/recall/F1 followed by overall accuracy.
print("Binary Classification (Label) Report:\n", classification_report(y_label_test, y_label_pred))


print("Accuracy for Label:", accuracy_score(y_label_test, y_label_pred))
# Classifier for the 'Subclass' target, trained on the same TF-IDF features.
# The `multi_class` argument is deprecated in scikit-learn 1.5; with the lbfgs
# solver a multi-class target is already fitted as a multinomial model, so the
# argument is dropped to avoid the FutureWarning (and a later TypeError).
lr_subclass = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_subclass.fit(X_train_tfidf, y_subclass_train)

# Predictions on the held-out split, reused by the evaluation plots below.
y_subclass_pred = lr_subclass.predict(X_test_tfidf)

# Per-class precision/recall/F1 followed by overall accuracy.
print("Multi-class Classification (Subclass) Report:\n",
      classification_report(y_subclass_test, y_subclass_pred))
print("Accuracy for Subclass:", accuracy_score(y_subclass_test, y_subclass_pred))

# Re-run both models on the test matrix so this section does not depend on
# the prediction variables assigned earlier.
y_label_pred = lr_label.predict(X_test_tfidf)
y_subclass_pred = lr_subclass.predict(X_test_tfidf)

# Side-by-side view of predictions and ground truth for each test row.
# 'Text' holds the cleaned text (X_test comes from df['cleaned_text']),
# not the raw input.
results = pd.DataFrame({
'Text': X_test,
'Predicted_Label': y_label_pred,
'Actual_Label': y_label_test,
'Predicted_Subclass': y_subclass_pred,
'Actual_Subclass': y_subclass_test
})

# Notebook-style preview; has no effect when run as a plain script.
results.head()

import pickle

# Persist both classifiers and the fitted vectorizer to the current working
# directory. All three are needed at inference time: the vectorizer must be
# the one the models were trained with.
with open('lr_label_model.pkl', 'wb') as label_file:
    pickle.dump(lr_label, label_file)

with open('lr_subclass_model.pkl', 'wb') as subclass_file:
    pickle.dump(lr_subclass, subclass_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

import pickle
import re

# Reload the saved artefacts from the current working directory.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# that this pipeline produced itself.
with open('lr_label_model.pkl', 'rb') as label_file:
    lr_label = pickle.load(label_file)

with open('lr_subclass_model.pkl', 'rb') as subclass_file:
    lr_subclass = pickle.load(subclass_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)
def clean_marathi_text(text):
    """Keep only Devanagari characters and whitespace, then tidy the spacing.

    Inference-time copy of the training cleaner. Both use the same two
    regular expressions, so new sentences are normalised the same way as the
    training data.

    Args:
        text: The raw string to clean.

    Returns:
        The string with every non-whitespace character outside
        U+0900-U+097F removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    marathi_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    marathi_only = re.sub(r'\s+', ' ', marathi_only).strip()
    return marathi_only

def predict_hate_speech(sentence):
    """Classify one sentence with the binary and the subclass model.

    The sentence is cleaned with ``clean_marathi_text``, vectorised with the
    loaded ``tfidf_vectorizer`` and passed to ``lr_label`` and
    ``lr_subclass``.

    Args:
        sentence: Raw input text.

    Returns:
        A ``(predicted_label, predicted_subclass)`` tuple holding the first
        (only) prediction of each model. The two models predict
        independently, so the pair is not guaranteed to be consistent.
    """
    cleaned_sentence = clean_marathi_text(sentence)
    # NOTE(review): no stopword removal here, unlike training. Presumably
    # harmless because tokens outside the fitted vocabulary are ignored by
    # transform() -- confirm.
    sentence_tfidf = tfidf_vectorizer.transform([cleaned_sentence])
    predicted_label = lr_label.predict(sentence_tfidf)[0]
    predicted_subclass = lr_subclass.predict(sentence_tfidf)[0]
    return predicted_label, predicted_subclass

# Smoke test on a single sentence. The first word is restored to "तुम्ही":
# the previous literal had its vowel sign split off by spaces, which breaks
# the word into different tokens.
example_sentence = "तुम्ही एकदम बेजबाबदार आहात!"

label, subclass = predict_hate_speech(example_sentence)

print(f"Predicted Label (Hate Speech or Not): {label}")
print(f"Predicted Subclass (Type of Hate Speech): {subclass}")

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the binary classifier on the held-out split
# (rows: actual, columns: predicted).
cm_label = confusion_matrix(y_label_test, y_label_pred)

plt.figure(figsize=(8, 6))
# NOTE(review): the tick names assume the sorted label values are
# "not hate" first, then "hate" (i.e. 0 then 1) -- confirm against the data.
sns.heatmap(cm_label, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Hate Speech', 'Hate Speech'],
            yticklabels=['Not Hate Speech', 'Hate Speech'])
plt.title('Confusion Matrix for Binary Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Confusion matrix of the subclass classifier on the held-out split.
# `labels` pins the matrix to all 11 subclass ids (0-10, as listed in
# sublabel_mapping). Without it the matrix only covers ids that occur in the
# test data or predictions, and the 0-10 tick labels land on the wrong cells.
cm_subclass = confusion_matrix(y_subclass_test, y_subclass_pred, labels=list(range(11)))

plt.figure(figsize=(12, 8))
sns.heatmap(cm_subclass, annot=True, fmt='d', cmap='Greens',
            xticklabels=range(11), yticklabels=range(11))
plt.title('Confusion Matrix for Multi-class Classification (Subclass)')
plt.xlabel('Predicted Subclass')
plt.ylabel('Actual Subclass')
plt.show()

# Classification reports as dicts, reshaped so each class/average is a row
# and each metric a column.
report_label = classification_report(y_label_test, y_label_pred, output_dict=True)
report_subclass = classification_report(y_subclass_test, y_subclass_pred, output_dict=True)
report_label_df = pd.DataFrame(report_label).transpose()
report_subclass_df = pd.DataFrame(report_subclass).transpose()

# DataFrame.plot opens its own figure when no `ax` is given, so the size is
# passed to it directly. A separate plt.figure() call would only leave an
# empty extra figure behind.
report_label_df[['precision', 'recall', 'f1-score']].plot(
    kind='bar', legend=True, figsize=(10, 6))
plt.title('Classification Report for Binary Classification')
plt.xticks(rotation=0)
plt.xlabel('Class')
plt.ylabel('Score')
plt.grid()
plt.show()

report_subclass_df[['precision', 'recall', 'f1-score']].plot(
    kind='bar', legend=True, figsize=(10, 6))
plt.title('Classification Report for Multi-class Classification (Subclass)')
plt.xticks(rotation=0)
plt.xlabel('Class')
plt.ylabel('Score')
plt.grid()
plt.show()

# How often each subclass was predicted on the held-out split.
plt.figure(figsize=(12, 6))
# `order` reserves one bar slot per subclass id 0-10. Without it the bars are
# laid out only for the ids that were actually predicted, and the fixed 0-10
# tick labels below would name the wrong bars.
sns.countplot(x=y_subclass_pred, order=list(range(11)), palette='viridis')
plt.title('Distribution of Predicted Subclasses')
plt.xlabel('Predicted Subclass')
plt.ylabel('Count')
plt.xticks(ticks=range(11), labels=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], rotation=45)
plt.grid()
plt.show()
# # Step 1: Import Necessary Libraries
#
# # Import basic libraries
# import pandas as pd
# import numpy as np
# import re
# import string
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
#
# # For text preprocessing
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
#
#
# # Step 2: Load the Dataset
#
# # Load the dataset
# file_path = 'data/majordatasetwithsubcalass.csv'
# df = pd.read_csv(file_path)
#
# # Display the first few rows of the dataset
# df.head()
#
#
# # Step 3: Check Data Overview
# # Let's understand the dataset structure and check for any missing values.
#
# # Check basic information about the dataset
# df.info()
#
# # Check for missing values
# df.isnull().sum()
#
# # Drop rows with missing values (if any)
# df.dropna(inplace=True)
#
#
# # Step 4: Define a Function to Clean Marathi Text
# # We define a function to remove any non-Marathi characters from the text data. This step is
# # critical to ensure that only Marathi language characters are present.
#
# # Function to clean the text by removing non-Marathi characters
# def clean_marathi_text(text):
# # Use a regular expression to remove anything that is not a Marathi character
# marathi_only = re.sub(r'[^\u0900-\u097F\s]', '', text) # Unicode range for Marathi characters
# # Remove extra spaces
# marathi_only = re.sub(r'\s+', ' ', marathi_only).strip()
# return marathi_only
#
# # Apply the cleaning function to the 'Text' column
# df['cleaned_text'] = df['Text'].apply(clean_marathi_text)
#
# # Display the cleaned text
# df[['Text', 'cleaned_text']].head()
#
#
# # Step 5: Remove Stopwords (Optional)
# # If you have a list of Marathi stopwords (common words that don't contribute much to the
# # meaning), you can remove them to further clean the text.
#
# # Load Marathi stopwords from NLTK or define your own list of stopwords
# marathi_stopwords = set(stopwords.words('C:\HSD major project\data\marathi_stopwords.txt'))
#
# # Function to remove stopwords from the text
# def remove_stopwords(text):
# return ' '.join([word for word in text.split() if word not in marathi_stopwords])
#
# # Apply stopword removal
# df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)
#
# # Display the cleaned text after removing stopwords
# df[['cleaned_text']].head()
#
##
# # Step 5: Split Data into Features and Labels
# # We split the dataset into features (cleaned text) and two labels: Label (binary classification)
# # and Subclass (multi-class classification).
#
# # Features (X) and Labels (y_label, y_subclass)
# X = df['cleaned_text'] # Cleaned text as features
# y_label = df['Label'] # Binary label (hate speech or not)
# y_subclass = df['Subclass'] # Multi-class label (type of hate speech)
#
# # Split the data into training and testing sets
# X_train, X_test, y_label_train, y_label_test, y_subclass_train, y_subclass_test = train_test_split(
#     X, y_label, y_subclass, test_size=0.2, random_state=42
# )
#
# # Print data sizes
# print(f'Training set size: {X_train.shape[0]}')
# print(f'Testing set size: {X_test.shape[0]}')
#
#
# # Step 6: Vectorize Text Using TF-IDF
# # We will vectorize the text using TF-IDF for numerical representation.
#
# # Initialize TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
#
# # Fit and transform the training data
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
#
# # Transform the test data
# X_test_tfidf = tfidf_vectorizer.transform(X_test)
#
# # Check shape
# print(f'Train TF-IDF shape: {X_train_tfidf.shape}')
# print(f'Test TF-IDF shape: {X_test_tfidf.shape}')
#
#
# # Step 7: Train Logistic Regression for Label (Binary Classification)
# # Train a Logistic Regression model to classify the Label (hate speech or not).
#
# # Train Logistic Regression for binary classification
# lr_label = LogisticRegression(max_iter=1000)
# lr_label.fit(X_train_tfidf, y_label_train)
#
# # Predict on test set
# y_label_pred = lr_label.predict(X_test_tfidf)
#
# # Evaluate the binary classifier
# print("Binary Classification (Label) Report:\n", classification_report(y_label_test, y_label_pred))
# print("Accuracy for Label:", accuracy_score(y_label_test, y_label_pred))
#
#
# # Step 8: Train Logistic Regression for Subclass (Multi-class Classification)
# # Now, we train another Logistic Regression model to classify the Subclass (type of hate speech).
#
# # Train Logistic Regression for subclass (multi-class classification)
# lr_subclass = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
# lr_subclass.fit(X_train_tfidf, y_subclass_train)
#
# # Predict on test set
# y_subclass_pred = lr_subclass.predict(X_test_tfidf)
#
# # Evaluate the multi-class classifier
# print("Multi-class Classification (Subclass) Report:\n", classification_report(y_subclass_test, y_subclass_pred))
# print("Accuracy for Subclass:", accuracy_score(y_subclass_test, y_subclass_pred))
#
#
#
#
# # Step 9: Joint Prediction and Output
# # We can now jointly predict both Label and Subclass and output them together.
#
# # Make predictions for both binary Label and Subclass
# y_label_pred = lr_label.predict(X_test_tfidf)
# y_subclass_pred = lr_subclass.predict(X_test_tfidf)
#
# # Combine the results into a DataFrame for easier visualization
# results = pd.DataFrame({
# 'Text': X_test,
# 'Predicted_Label': y_label_pred,
# 'Actual_Label': y_label_test,
# 'Predicted_Subclass': y_subclass_pred,
# 'Actual_Subclass': y_subclass_test
# })
#
# # Display the results
# results.head()
#
#
# # Step 10: Saving Models and Vectorizer
# # Save the models and the vectorizer for deployment.
#
# import pickle
#
# # Save the Logistic Regression models and TF-IDF vectorizer
# with open('models/lr_label_model.pkl', 'wb') as label_file:
# pickle.dump(lr_label, label_file)
#
# with open('models/lr_subclass_model.pkl', 'wb') as subclass_file:
# pickle.dump(lr_subclass, subclass_file)
#
# with open('models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
# pickle.dump(tfidf_vectorizer, vectorizer_file)

You might also like