0% found this document useful (0 votes)

12 views10 pages

Main Py

The document outlines a process for building a hate speech detection model using a dataset of Marathi text. It includes steps for data cleaning, preprocessing, feature extraction with TF-IDF, and training logistic regression models for binary and multi-class classification. The results are evaluated using classification reports and confusion matrices, and the models are saved for future use.

Uploaded by

niraj21it

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

12 views10 pages

Main Py

Uploaded by

niraj21it

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 10

import pandas as pd

import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Path to the labelled dataset CSV (absolute path under /content).
file_path = '/content/niraj data set - Sheet1.csv'

df = pd.read_csv(file_path)
# Bare expression: its value is only displayed in a notebook/REPL; as a script it is discarded.
df.head()

# Print a summary of the columns, dtypes and non-null counts.
df.info()
# Per-column count of missing values (again only displayed in a notebook/REPL).
df.isnull().sum()
# Drop every row that has a missing value in any column, modifying df in place.
df.dropna(inplace=True)

# Report how many rows carry each binary label. Any value other than 1 is
# printed as "Not Hate Speech".
label_counts = df['Label'].value_counts()
print("\nNumber of data rows for each main label:")
for label, count in label_counts.items():
    label_name = "Hate Speech" if label == 1 else "Not Hate Speech"
    print(f"{label_name} ({label}): {count}")

# Report how many rows carry each subclass id, translating ids to names.
sublabel_counts = df['Subclass'].value_counts()
print("\nNumber of data rows for each sublabel:")
sublabel_mapping = {
    0: "Not Hate Speech",
    1: "Defamatory/Insulting",
    2: "Arrogative",
    3: "Harassing",
    4: "Gender Abusive",
    5: "Racist",
    6: "Religious Intolerant",
    7: "Homophobic",
    8: "Xenophobic",
    9: "Disability Discriminatory",
    10: "Classist",
}
for sublabel, count in sublabel_counts.items():
    # Ids missing from the mapping are shown as "Unknown" rather than failing.
    sublabel_name = sublabel_mapping.get(sublabel, "Unknown")
    print(f"{sublabel_name} ({sublabel}): {count}")

def clean_marathi_text(text):
    """Keep only Devanagari characters and whitespace, then tidy the spacing.

    Every character outside U+0900-U+097F that is not whitespace is removed,
    runs of whitespace are collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw text. ``None`` is treated as an empty string and any
            other non-string value is converted with ``str()`` first, so a
            stray NaN or number in the column does not raise.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    marathi_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return re.sub(r'\s+', ' ', marathi_only).strip()

# Add a column holding the Devanagari-only version of every text.
df['cleaned_text'] = df['Text'].apply(clean_marathi_text)
df[['Text', 'cleaned_text']].head()

# Read the custom stop-word list directly: one word per line, blank lines
# skipped. nltk's stopwords.words() is documented to take corpus file ids
# (such as 'english'); handing it an absolute filesystem path only works when
# the corpus reader's internal path join happens to honour absolute paths.
with open('/content/marathi_stopwords.txt', encoding='utf-8') as stopword_file:
    marathi_stopwords = {line.strip() for line in stopword_file if line.strip()}

def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated stop word removed.

    Words are compared against the module-level ``marathi_stopwords`` set and
    the surviving words are re-joined with single spaces.

    Args:
        text: A string to filter.

    Returns:
        The filtered string (empty if every word was a stop word).
    """
    kept_words = (word for word in text.split() if word not in marathi_stopwords)
    return ' '.join(kept_words)

# Overwrite the cleaned column with its stop-word-free version.
df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)
df[['cleaned_text']].head()

# Features are the cleaned text; there are two targets over the same rows:
# the 'Label' column and the 'Subclass' column.
X = df['cleaned_text']
y_label = df['Label']
y_subclass = df['Subclass']

# A single call splits the features and both targets together, so the rows stay
# aligned. 20% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_label_train, y_label_test, y_subclass_train, y_subclass_test = train_test_split(

X, y_label, y_subclass, test_size=0.2, random_state=42
)

print(f'Training set size: {X_train.shape[0]}')

print(f'Testing set size: {X_test.shape[0]}')

# Tokenise on whitespace instead of scikit-learn's default token_pattern
# r"(?u)\b\w\w+\b". In Python's re module \w does not match Devanagari
# combining marks (vowel signs, virama), so the default pattern breaks Marathi
# words apart at every such mark and drops the resulting one-character pieces.
# The text has already been reduced to Devanagari words separated by single
# spaces, so each run of non-whitespace characters is one word.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r'\S+')

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them unchanged for the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f'Train TF-IDF shape: {X_train_tfidf.shape}')
print(f'Test TF-IDF shape: {X_test_tfidf.shape}')

# --- Binary model: predicts the 'Label' target ---
lr_label = LogisticRegression(max_iter=1000)
lr_label.fit(X_train_tfidf, y_label_train)
y_label_pred = lr_label.predict(X_test_tfidf)

print("Binary Classification (Label) Report:\n", classification_report(y_label_test, y_label_pred))
print("Accuracy for Label:", accuracy_score(y_label_test, y_label_pred))

# --- Multi-class model: predicts the 'Subclass' target ---
# The explicit multi_class='multinomial' argument is omitted: it is deprecated
# in recent scikit-learn releases, and with the lbfgs solver a multinomial
# model is already what is fitted for a multi-class target (scikit-learn >= 0.22).
lr_subclass = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_subclass.fit(X_train_tfidf, y_subclass_train)
y_subclass_pred = lr_subclass.predict(X_test_tfidf)

print(
    "Multi-class Classification (Subclass) Report:\n",
    classification_report(y_subclass_test, y_subclass_pred),
)
print("Accuracy for Subclass:", accuracy_score(y_subclass_test, y_subclass_pred))

# Side-by-side table of predictions and ground truth for the test rows. The
# predictions computed above are reused; predicting again on the same matrix
# with the same fitted models would return the same values.
results = pd.DataFrame({
    'Text': X_test,
    'Predicted_Label': y_label_pred,
    'Actual_Label': y_label_test,
    'Predicted_Subclass': y_subclass_pred,
    'Actual_Subclass': y_subclass_test,
})

results.head()

import pickle

# Persist both classifiers and the fitted vectorizer so that predictions can
# be made later without retraining. Each file is opened in a `with` block so
# it is closed even if pickling fails.
for pickle_path, fitted_object in (
    ('lr_label_model.pkl', lr_label),
    ('lr_subclass_model.pkl', lr_subclass),
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)

import pickle
import re

# Reload the saved classifiers and vectorizer from disk.
# NOTE(review): pickle.load executes code embedded in the file, so only load
# .pkl files that were produced by this script or come from a trusted source.
with open('lr_label_model.pkl', 'rb') as label_file:
    lr_label = pickle.load(label_file)

with open('lr_subclass_model.pkl', 'rb') as subclass_file:
    lr_subclass = pickle.load(subclass_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)
def clean_marathi_text(text):
    """Keep only Devanagari characters and whitespace, then tidy the spacing.

    Every character outside U+0900-U+097F that is not whitespace is removed,
    runs of whitespace are collapsed to a single space, and leading/trailing
    whitespace is stripped. This uses the same two regular expressions that
    were applied to the training text.

    Args:
        text: The raw text. ``None`` is treated as an empty string and any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    marathi_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return re.sub(r'\s+', ' ', marathi_only).strip()

def predict_hate_speech(sentence):
    """Predict the binary label and the subclass for one sentence.

    The sentence is cleaned with ``clean_marathi_text``, vectorised with the
    module-level ``tfidf_vectorizer`` and passed to both module-level models.

    Args:
        sentence: The raw input text.

    Returns:
        A ``(predicted_label, predicted_subclass)`` tuple holding the first
        (and only) prediction of ``lr_label`` and ``lr_subclass``.
    """
    cleaned_sentence = clean_marathi_text(sentence)
    # NOTE(review): stop words are not removed here. The training text had them
    # removed before the vectorizer was fitted, so they are presumably absent
    # from its vocabulary and ignored by transform() - confirm.
    # transform() expects an iterable of documents, hence the one-element list.
    sentence_tfidf = tfidf_vectorizer.transform([cleaned_sentence])
    predicted_label = lr_label.predict(sentence_tfidf)[0]
    predicted_subclass = lr_subclass.predict(sentence_tfidf)[0]
    return predicted_label, predicted_subclass

# Sample sentence for a quick check of the loaded models. The first word is
# written with its vowel sign attached to the consonant: a detached sign
# would survive cleaning as a separate whitespace-delimited token.
example_sentence = "तुम्ही एकदम बेजबाबदार आहात!"

label, subclass = predict_hate_speech(example_sentence)

print(f"Predicted Label (Hate Speech or Not): {label}")
print(f"Predicted Subclass (Type of Hate Speech): {subclass}")

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.metrics import confusion_matrix

# --- Confusion matrix for the binary label ---
cm_label = confusion_matrix(y_label_test, y_label_pred)

binary_class_names = ['Not Hate Speech', 'Hate Speech']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_label,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=binary_class_names,
    yticklabels=binary_class_names,
)
plt.title('Confusion Matrix for Binary Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# --- Confusion matrix for the subclass ---
# The label list is passed explicitly so the matrix is always 11 x 11 and
# lines up with the 11 tick labels, even when some subclass never occurs in
# the test split or in the predictions.
# Assumes subclass ids are the integers 0-10 - TODO confirm against the data.
subclass_ids = list(range(11))
cm_subclass = confusion_matrix(y_subclass_test, y_subclass_pred, labels=subclass_ids)

plt.figure(figsize=(12, 8))
sns.heatmap(
    cm_subclass,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=subclass_ids,
    yticklabels=subclass_ids,
)
plt.title('Confusion Matrix for Multi-class Classification (Subclass)')
plt.xlabel('Predicted Subclass')
plt.ylabel('Actual Subclass')
plt.show()

# Get both classification reports as dicts and turn them into DataFrames with
# one row per report entry and one column per metric.
report_label = classification_report(y_label_test, y_label_pred, output_dict=True)
report_subclass = classification_report(y_subclass_test, y_subclass_pred, output_dict=True)
report_label_df = pd.DataFrame(report_label).transpose()
report_subclass_df = pd.DataFrame(report_subclass).transpose()

metric_columns = ['precision', 'recall', 'f1-score']

# DataFrame.plot() opens its own figure when no `ax` is given, so the size is
# passed through `figsize` rather than via a separate plt.figure() call (which
# would leave an extra, empty figure behind).
report_label_df[metric_columns].plot(kind='bar', legend=True, figsize=(10, 6))
plt.title('Classification Report for Binary Classification')
plt.xticks(rotation=0)
plt.xlabel('Class')
plt.ylabel('Score')
plt.grid()
plt.show()

report_subclass_df[metric_columns].plot(kind='bar', legend=True, figsize=(10, 6))
plt.title('Classification Report for Multi-class Classification (Subclass)')
plt.xticks(rotation=0)
plt.xlabel('Class')
plt.ylabel('Score')
plt.grid()
plt.show()

# Bar chart of how often each subclass was predicted on the test split.
# `order` gives every subclass id 0-10 its own bar position (count 0 if never
# predicted), so the 11 tick labels set below always match the bars.
# Assumes subclass ids are the integers 0-10 - TODO confirm against the data.
subclass_order = list(range(11))
plt.figure(figsize=(12, 6))
sns.countplot(x=y_subclass_pred, order=subclass_order, palette='viridis')
plt.title('Distribution of Predicted Subclasses')
plt.xlabel('Predicted Subclass')
plt.ylabel('Count')
plt.xticks(ticks=range(11), labels=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], rotation=45)
plt.grid()
plt.show()
# # Step 1: Import Necessary Libraries
#
# # Import basic libraries
# import pandas as pd
# import numpy as np
# import re
# import string
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
#
# # For text preprocessing
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
#
#
# # Step 2: Load the Dataset
#
# # Load the dataset
# file_path = 'data/majordatasetwithsubcalass.csv'
# df = pd.read_csv(file_path)
#
# # Display the first few rows of the dataset
# df.head()
#
#
# # Step 3: Check Data Overview
# # Let's understand the dataset structure and check for any missing values.
#
# # Check basic information about the dataset
# df.info()
#
# # Check for missing values
# df.isnull().sum()
#
# # Drop rows with missing values (if any)
# df.dropna(inplace=True)
#
#
# # Step 4: Define a Function to Clean Marathi Text
# # We define a function to remove any non-Marathi characters from the text data. This step is
# # critical to ensure that only Marathi language characters are present.
#
# # Function to clean the text by removing non-Marathi characters
# def clean_marathi_text(text):
# # Use a regular expression to remove anything that is not a Marathi character
# marathi_only = re.sub(r'[^\u0900-\u097F\s]', '', text) # Unicode range for Marathi characters
# # Remove extra spaces
# marathi_only = re.sub(r'\s+', ' ', marathi_only).strip()
# return marathi_only
#
# # Apply the cleaning function to the 'Text' column
# df['cleaned_text'] = df['Text'].apply(clean_marathi_text)
#
# # Display the cleaned text
# df[['Text', 'cleaned_text']].head()
#
#
# # Step 5: Remove Stopwords (Optional)
# # If you have a list of Marathi stopwords (common words that don't contribute much to the
# # meaning), you can remove them to further clean the text.
#
# # Load Marathi stopwords from NLTK or define your own list of stopwords
# marathi_stopwords = set(stopwords.words('C:\HSD major project\data\marathi_stopwords.txt'))
#
# # Function to remove stopwords from the text
# def remove_stopwords(text):
# return ' '.join([word for word in text.split() if word not in marathi_stopwords])
#
# # Apply stopword removal
# df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)
#
# # Display the cleaned text after removing stopwords
# df[['cleaned_text']].head()
#
##
# # Step 5: Split Data into Features and Labels
# # We split the dataset into features (cleaned text) and two labels: Label (binary classification)
# # and Subclass (multi-class classification).
#
# # Features (X) and Labels (y_label, y_subclass)
# X = df['cleaned_text'] # Cleaned text as features
# y_label = df['Label'] # Binary label (hate speech or not)
# y_subclass = df['Subclass'] # Multi-class label (type of hate speech)
#
# # Split the data into training and testing sets
# X_train, X_test, y_label_train, y_label_test, y_subclass_train, y_subclass_test = train_test_split(
# X, y_label, y_subclass, test_size=0.2, random_state=42
#)
#
# # Print data sizes
# print(f'Training set size: {X_train.shape[0]}')
# print(f'Testing set size: {X_test.shape[0]}')
#
#
# # Step 6: Vectorize Text Using TF-IDF
# # We will vectorize the text using TF-IDF for numerical representation.
#
# # Initialize TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
#
# # Fit and transform the training data
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
#
# # Transform the test data
# X_test_tfidf = tfidf_vectorizer.transform(X_test)
#
# # Check shape
# print(f'Train TF-IDF shape: {X_train_tfidf.shape}')
# print(f'Test TF-IDF shape: {X_test_tfidf.shape}')
#
#
# # Step 7: Train Logistic Regression for Label (Binary Classification)
# # Train a Logistic Regression model to classify the Label (hate speech or not).
#
# # Train Logistic Regression for binary classification
# lr_label = LogisticRegression(max_iter=1000)
# lr_label.fit(X_train_tfidf, y_label_train)
#
# # Predict on test set
# y_label_pred = lr_label.predict(X_test_tfidf)
#
# # Evaluate the binary classifier
# print("Binary Classification (Label) Report:\n", classification_report(y_label_test, y_label_pred))
# print("Accuracy for Label:", accuracy_score(y_label_test, y_label_pred))
#
#
# # Step 8: Train Logistic Regression for Subclass (Multi-class Classification)
# # Now, we train another Logistic Regression model to classify the Subclass (type of hate speech).
#
# # Train Logistic Regression for subclass (multi-class classification)
# lr_subclass = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
# lr_subclass.fit(X_train_tfidf, y_subclass_train)
#
# # Predict on test set
# y_subclass_pred = lr_subclass.predict(X_test_tfidf)
#
# # Evaluate the multi-class classifier
# print("Multi-class Classification (Subclass) Report:\n", classification_report(y_subclass_test, y_subclass_pred))
# print("Accuracy for Subclass:", accuracy_score(y_subclass_test, y_subclass_pred))
#
#
#
#
# # Step 9: Joint Prediction and Output
# # We can now jointly predict both Label and Subclass and output them together.
#
# # Make predictions for both binary Label and Subclass
# y_label_pred = lr_label.predict(X_test_tfidf)
# y_subclass_pred = lr_subclass.predict(X_test_tfidf)
#
# # Combine the results into a DataFrame for easier visualization
# results = pd.DataFrame({
# 'Text': X_test,
# 'Predicted_Label': y_label_pred,
# 'Actual_Label': y_label_test,
# 'Predicted_Subclass': y_subclass_pred,
# 'Actual_Subclass': y_subclass_test
# })
#
# # Display the results
# results.head()
#
#
# # Step 10: Saving Models and Vectorizer
# # Save the models and the vectorizer for deployment.
#
# import pickle
#
# # Save the Logistic Regression models and TF-IDF vectorizer
# with open('models/lr_label_model.pkl', 'wb') as label_file:
# pickle.dump(lr_label, label_file)
#
# with open('models/lr_subclass_model.pkl', 'wb') as subclass_file:
# pickle.dump(lr_subclass, subclass_file)
#
# with open('models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
# pickle.dump(tfidf_vectorizer, vectorizer_file)

Marathi Hate Speech Detection IEEE Paper
No ratings yet
Marathi Hate Speech Detection IEEE Paper
5 pages
Kathrin Melcher, Rosaria Silipo - Codeless Deep Learning With KNIME-PACKT Publishing LTD. (2020)
No ratings yet
Kathrin Melcher, Rosaria Silipo - Codeless Deep Learning With KNIME-PACKT Publishing LTD. (2020)
313 pages
ML (1) (Lab)
No ratings yet
ML (1) (Lab)
51 pages
Personalized Cancer Diagnosis
No ratings yet
Personalized Cancer Diagnosis
100 pages
Data Mining Numericals
No ratings yet
Data Mining Numericals
38 pages
Ir Practical Manual 2
No ratings yet
Ir Practical Manual 2
24 pages
ML Lab Manual-2019
No ratings yet
ML Lab Manual-2019
85 pages
IR - Group1
No ratings yet
IR - Group1
27 pages
Text Classification With Switch Transformer
No ratings yet
Text Classification With Switch Transformer
27 pages
Beginner's Guide To Data Cleaning and Feature Extraction in NLP - by Enes Gokce - Towards Data Science
No ratings yet
Beginner's Guide To Data Cleaning and Feature Extraction in NLP - by Enes Gokce - Towards Data Science
20 pages
Practical 1
No ratings yet
Practical 1
18 pages
03 The-Different-Methods-Deal-Text-Data-Predictive-Python
No ratings yet
03 The-Different-Methods-Deal-Text-Data-Predictive-Python
16 pages
NLP Labsheet-2 Sentiment Analysis Using Naive Bayes Classifier
No ratings yet
NLP Labsheet-2 Sentiment Analysis Using Naive Bayes Classifier
15 pages
2024 Acl-Long 778
No ratings yet
2024 Acl-Long 778
15 pages
03 ML Essentials
No ratings yet
03 ML Essentials
52 pages
Lab Report 8
No ratings yet
Lab Report 8
11 pages
Reg. No.: 39110009 Colab Notebook Link: Name: Abivirshan Suresh
No ratings yet
Reg. No.: 39110009 Colab Notebook Link: Name: Abivirshan Suresh
27 pages
Aped For Fake News
No ratings yet
Aped For Fake News
6 pages
App Py
No ratings yet
App Py
7 pages
Index: SR. NO. Practical Name Date of Perform NO. Sign
No ratings yet
Index: SR. NO. Practical Name Date of Perform NO. Sign
23 pages
School of Engineering: Lab Manual On Machine Learning Lab
No ratings yet
School of Engineering: Lab Manual On Machine Learning Lab
23 pages
ML Week10.1
No ratings yet
ML Week10.1
5 pages
Hate Speech - POS
No ratings yet
Hate Speech - POS
7 pages
Ment Analysis Text Classification
No ratings yet
Ment Analysis Text Classification
9 pages
Report On - Social Media Research Topic Modeling
No ratings yet
Report On - Social Media Research Topic Modeling
26 pages
Topic Classifierby David Caleb
No ratings yet
Topic Classifierby David Caleb
7 pages
FakeNewsDetection Student
No ratings yet
FakeNewsDetection Student
7 pages
17 - Source Code - nlp-2-5
No ratings yet
17 - Source Code - nlp-2-5
4 pages
ML (1) (Lab)
No ratings yet
ML (1) (Lab)
18 pages
Information Retrival
No ratings yet
Information Retrival
43 pages
DS - Lab Report.
No ratings yet
DS - Lab Report.
25 pages
MLA TAB Lecture2
No ratings yet
MLA TAB Lecture2
84 pages
Big Data
No ratings yet
Big Data
5 pages
Ai Lab Final
No ratings yet
Ai Lab Final
21 pages
NLP Tushar
No ratings yet
NLP Tushar
21 pages
7th Sem Report File
No ratings yet
7th Sem Report File
41 pages
Blue Doodle Project Presentation
No ratings yet
Blue Doodle Project Presentation
15 pages
Multi-Class Text Classification With Scikit-Learn
No ratings yet
Multi-Class Text Classification With Scikit-Learn
20 pages
Se 3 Tal 5 Ees
No ratings yet
Se 3 Tal 5 Ees
1 page
Email Spam Classifier
No ratings yet
Email Spam Classifier
22 pages
Sma 3
No ratings yet
Sma 3
3 pages
Rubrics For Mini Project
No ratings yet
Rubrics For Mini Project
3 pages
AI Phash3
No ratings yet
AI Phash3
11 pages
Self Evaluation Exercises
No ratings yet
Self Evaluation Exercises
12 pages
Big Data Merged
No ratings yet
Big Data Merged
7 pages
Python CA 4
No ratings yet
Python CA 4
9 pages
DSBA+Master+Codebook+ +Text+Mining+&+TSF
No ratings yet
DSBA+Master+Codebook+ +Text+Mining+&+TSF
11 pages
Methodology
No ratings yet
Methodology
9 pages
Shreya Srivastava-27
No ratings yet
Shreya Srivastava-27
3 pages
Fraud Detection in Python Chapter4
No ratings yet
Fraud Detection in Python Chapter4
33 pages
2022 A Hybrid DenseNet121-UNet Model For Brain Tumor Segmentation From MR Images
No ratings yet
2022 A Hybrid DenseNet121-UNet Model For Brain Tumor Segmentation From MR Images
9 pages
Data Preprocessing
No ratings yet
Data Preprocessing
38 pages
Lec 18
100% (1)
Lec 18
34 pages
Lab5 Example Fall 23
No ratings yet
Lab5 Example Fall 23
4 pages
Cyberbullying Code
No ratings yet
Cyberbullying Code
6 pages
Class 10 - Sahodaya - Artificial Intelligence - AK
No ratings yet
Class 10 - Sahodaya - Artificial Intelligence - AK
11 pages
Ie ML Project (Getting Started)
No ratings yet
Ie ML Project (Getting Started)
3 pages
Simple NMT
No ratings yet
Simple NMT
3 pages
Machine Learning Notes Anna University
No ratings yet
Machine Learning Notes Anna University
9 pages
Deep Learning Practical File
No ratings yet
Deep Learning Practical File
36 pages
Used Price Prediction
No ratings yet
Used Price Prediction
4 pages
Anomaly Detection in Surveillance
No ratings yet
Anomaly Detection in Surveillance
9 pages
Identification of Medicinal Plants Using Deep Learning Synopsis
No ratings yet
Identification of Medicinal Plants Using Deep Learning Synopsis
2 pages
Research Article
No ratings yet
Research Article
7 pages
Preprints202502 2059 v1
No ratings yet
Preprints202502 2059 v1
19 pages
02-knn Notes
No ratings yet
02-knn Notes
23 pages
Stock Price Prediction NEPSE
No ratings yet
Stock Price Prediction NEPSE
36 pages
In-Class Exercise Solutions - Perceptrons
No ratings yet
In-Class Exercise Solutions - Perceptrons
23 pages
Assignment - 13: Title
No ratings yet
Assignment - 13: Title
2 pages
Spam Email Detection Using Machine Learning
No ratings yet
Spam Email Detection Using Machine Learning
8 pages
Sheet1 1
No ratings yet
Sheet1 1
2 pages
Srujan ML 2 Project Fin
No ratings yet
Srujan ML 2 Project Fin
39 pages
Lasso Vs Ridge Vs Elastic 1
No ratings yet
Lasso Vs Ridge Vs Elastic 1
5 pages
A Novel Lightweight Real Time Traffic Sign Detection Method Based On An Embedded Device and Yolov8
No ratings yet
A Novel Lightweight Real Time Traffic Sign Detection Method Based On An Embedded Device and Yolov8
10 pages
Report
No ratings yet
Report
2 pages
(S1 IJEECS 2021 Rohit Chivukula) Classifying Clinically KNN and SVM
No ratings yet
(S1 IJEECS 2021 Rohit Chivukula) Classifying Clinically KNN and SVM
8 pages
Random - Forest - Classification - Ipynb - Colab
No ratings yet
Random - Forest - Classification - Ipynb - Colab
3 pages
Detection of Fraud Statement Based On Word Vector Evidence From Financial Companies in China - ScienceDirect
No ratings yet
Detection of Fraud Statement Based On Word Vector Evidence From Financial Companies in China - ScienceDirect
9 pages
PEMODELAN PREDIKSI KESEHATAN MENTAL MAHASISWA DI LINGKUNGAN MULTIKULTURAL MENGGUNAKAN ALGORITMA DECISION TREE J48 Eng
No ratings yet
PEMODELAN PREDIKSI KESEHATAN MENTAL MAHASISWA DI LINGKUNGAN MULTIKULTURAL MENGGUNAKAN ALGORITMA DECISION TREE J48 Eng
7 pages
Filipino Online Scam Data Classification Decision Tree Algorithms
No ratings yet
Filipino Online Scam Data Classification Decision Tree Algorithms
6 pages
Project 5j
No ratings yet
Project 5j
2 pages
Dyslexia Deep Clustering Using Webcam-Based Eye Tracking
No ratings yet
Dyslexia Deep Clustering Using Webcam-Based Eye Tracking
9 pages
06 Regression With Simple Data Preparation
No ratings yet
06 Regression With Simple Data Preparation
2 pages
C Programming
From Everand
C Programming
Netra
No ratings yet
Advanced C Concepts and Programming: First Edition
From Everand
Advanced C Concepts and Programming: First Edition
Gayatri
3/5 (1)
Java Programming Tutorial With Screen Shots & Many Code Example
From Everand
Java Programming Tutorial With Screen Shots & Many Code Example
Desmond Ohwofosirai
No ratings yet
Introduction to PHP, Part 2, Second Edition
From Everand
Introduction to PHP, Part 2, Second Edition
Adam Majczak
No ratings yet
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet

Main Py

Uploaded by

Main Py

Uploaded by

import pandas as pd

file_path = '/content/niraj data set - Sheet1.csv'

X_train, X_test, y_label_train, y_label_test, y_subclass_train, y_subclass_test = train_test_split(

print(f'Training set size: {X_train.shape[0]}')

print(f'Train TF-IDF shape: {X_train_tfidf.shape}')

print("Binary Classification (Label) Report:\n", classification_report(y_label_test, y_label_pred))

print("Multi-class Classification (Subclass) Report:\n", classification_report(y_subclass_test,

with open('lr_label_model.pkl', 'wb') as label_file:

with open('lr_subclass_model.pkl', 'wb') as subclass_file:

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:

with open('lr_label_model.pkl', 'rb') as label_file:

with open('lr_subclass_model.pkl', 'rb') as subclass_file:

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:

example_sentence = "तम् ु ही एकदम बेजबाबदार आहात!"

print(f"Predicted Label (Hate Speech or Not): {label}")

import matplotlib.pyplot as plt

cm_label = confusion_matrix(y_label_test, y_label_pred)

cm_subclass = confusion_matrix(y_subclass_test, y_subclass_pred)

report_label = classification_report(y_label_test, y_label_pred, output_dict=True)

You might also like