Python Project

This document builds a news-article classifier in Python. It loads article text and labels from a CSV file, preprocesses the text (lowercasing, stripping special characters and digits, removing stopwords, lemmatizing), splits the data into training and test sets, converts the text to TF-IDF features, trains a PassiveAggressiveClassifier on the training set, and evaluates the model's accuracy and confusion matrix on the test set.


# Import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources (if not downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Note: on NLTK 3.9+, word_tokenize also needs: nltk.download('punkt_tab')

# Load the dataset (assuming it's in CSV format)
data = pd.read_csv('news.csv')  # Replace 'news.csv' with the path to your file

# Explore the dataset
print(data.head())  # Check the first few rows
print(data.info())  # Get information about the dataset

# Data preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stop words and lemmatize tokens
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join words back into text
    processed_text = ' '.join(words)
    return processed_text
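
# Quick sanity check on a made-up sentence (not from the dataset); with the
# steps above, this should print: 'breaking news reason market crashed'
print(preprocess_text("Breaking News: 5 Reasons the Markets Crashed!"))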

data['text'] = data['text'].apply(preprocess_text)

# Feature extraction
X = data['text']
y = data['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
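
# Optional tweak (not in the original script): if the label classes are
# imbalanced, stratify=y keeps the class ratios the same in both splits.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)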

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
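
# Optional: peek at a few learned vocabulary terms to sanity-check the
# features (get_feature_names_out requires scikit-learn >= 1.0):
print(tfidf_vectorizer.get_feature_names_out()[:10])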

# Model building - using Passive Aggressive Classifier
model = PassiveAggressiveClassifier(max_iter=50)
model.fit(tfidf_train, y_train)

# Prediction
y_pred = model.predict(tfidf_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")


print(f"Confusion Matrix:\n{conf_matrix}")
