
SMA Exp 03 Code Print

The document outlines an experiment to scrape Twitter data related to the Union Budget 2023 using Python libraries such as snscrape and pandas. It details the process of data cleaning, preprocessing, and exploratory data analysis, including sentiment analysis using TextBlob and visualization techniques like word clouds. The final output includes sentiment distribution and visual representations of positive, negative, and neutral sentiments from the tweets.


EXPERIMENT NO - 03

CODE:

1. Scrape Twitter Data for Union Budget 2023

!pip install snscrape
import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
import re
import textblob
from textblob import TextBlob
import os
from wordcloud import WordCloud, STOPWORDS
from wordcloud import ImageColorGenerator
import warnings
%matplotlib inline

# Scrape up to 5000 tweets about Budget 2023 posted between 2023-01-31 and 2023-02-07
os.system("snscrape --jsonl --max-results 5000 --since 2023-01-31 twitter-search 'Budget 2023 until:2023-02-07' > text-query-tweets.json")
tweets_df = pd.read_json("text-query-tweets.json", lines=True)
tweets_df.head(5)
tweets_df.to_csv("budget_tweets.csv", index=False)  # output filename assumed; the printed call passed no arguments
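
Note: snscrape.modules.twitter is imported above but the scrape itself shells out to the CLI. A minimal sketch of the same search through the library's Python interface (attribute names follow recent snscrape tweet objects and are an assumption for older versions):

# Sketch: same query via the snscrape Python API instead of os.system
query = "Budget 2023 since:2023-01-31 until:2023-02-07"
rows = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
    if i >= 5000:  # mirrors --max-results 5000
        break
    rows.append([tweet.date, tweet.rawContent, tweet.user.username, tweet.likeCount])
tweets_df = pd.DataFrame(rows, columns=['date', 'rawContent', 'username', 'likeCount'])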

2. Data Loading
df1 = tweets_df[['date', 'rawContent', 'renderedContent', 'user', 'replyCount',
                 'retweetCount', 'likeCount', 'lang', 'place', 'hashtags', 'viewCount']].copy()
df1.head()
df1.shape

3. Twitter Data Cleaning, Preprocessing and Exploratory Data Analysis


df1 = df1.drop_duplicates("renderedContent")
df1.shape
df1.head()
df1.info()
df1.date.value_counts()

# Visualise where missing values occur across the columns
plt.figure(figsize=(17, 5))
sns.heatmap(df1.isnull(), cbar=True, yticklabels=False)
plt.xlabel("Column_Name", size=14, weight="bold")
plt.title("Places of missing values in column", size=17)
plt.show()

import plotly.graph_objects as go
Top_Location_Of_tweet = df1['place'].value_counts().head(10)
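
plotly.graph_objects is imported above but Top_Location_Of_tweet is never plotted in the printed code; a minimal sketch of one way to chart it (the astype(str) cast is an assumption, since the place entries may not be plain strings):

# Horizontal bar chart of the 10 most frequent tweet locations (sketch)
fig = go.Figure(go.Bar(
    x=Top_Location_Of_tweet.values,
    y=Top_Location_Of_tweet.index.astype(str),
    orientation='h'))
fig.update_layout(title='Top 10 locations of Budget 2023 tweets',
                  xaxis_title='Tweet count', yaxis_title='Location')
fig.show()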

Twitter Data Cleaning and Preprocessing

from nltk.corpus import stopwords
stop = stopwords.words('english')

# Drop English stopwords; applied per word and assigned back
# (the printed version iterated characters and discarded the result)
df1['renderedContent'] = df1['renderedContent'].apply(
    lambda x: " ".join(word for word in x.split() if word.lower() not in stop))
df1.shape

!pip install tweet-preprocessor

# Remove unnecessary characters
punct = ['%', '/', ':', '\\', '&amp', '&', ';', '?']
def remove_punctuations(text):
    for punctuation in punct:
        text = text.replace(punctuation, '')
    return text

df1['renderedContent'] = df1['renderedContent'].apply(lambda x: remove_punctuations(x))
df1['renderedContent'].replace('', np.nan, inplace=True)
df1.dropna(subset=["renderedContent"], inplace=True)
len(df1)
df1 = df1.reset_index(drop=True)
df1.head()

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
sns.set_style('whitegrid')
%matplotlib inline
# Extend the stopword list with query terms that would otherwise dominate the counts
stop = stop + ['budget2023', 'budget', 'httpst', '2023', 'modi', 'nsitaraman',
               'union', 'pmindia', 'tax', 'india']
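
tweet-preprocessor is installed above but never called in the printed code; a minimal sketch of how it could strip URLs, mentions and emojis before the manual punctuation pass (the chosen option set is an assumption):

import preprocessor as p  # import name of the tweet-preprocessor package
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)  # choose what to strip
df1['renderedContent'] = df1['renderedContent'].apply(p.clean)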
def plot_20_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    # Note: get_feature_names() was renamed get_feature_names_out() in newer scikit-learn
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        # Sum counts across all rows (the print shows '=', which would keep only the last row)
        total_counts += t.toarray()[0]

    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:20]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))

    plt.figure(2, (40, 40))
    plt.subplot(title='20 most common words')
    sns.set_context('notebook', font_scale=4, rc={'lines.linewidth': 2.5})
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

count_vectorizer = CountVectorizer(stop_words=stop)
# Fit and transform the processed tweets
count_data = count_vectorizer.fit_transform(df1['renderedContent'])
# print(count_vectorizer)
# print(count_data)
# Visualise the 20 most common words
plot_20_most_common_words(count_data, count_vectorizer)
plt.savefig('saved_figure.png')

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
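
cufflinks is configured above but never used in the printed code; once cf.go_offline() has run, pandas objects gain an .iplot() method, e.g. (a sketch, reusing the location counts computed earlier):

# Interactive bar chart straight from a pandas Series (sketch)
Top_Location_Of_tweet.iplot(kind='bar', title='Top 10 tweet locations')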

def get_top_n_bigram(corpus, n=None):
    # Note: ngram_range=(2, 4) counts bigrams up to 4-grams, not only bigrams
    vec = CountVectorizer(ngram_range=(2, 4), stop_words="english").fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(df1['renderedContent'], 8)
bigram_df = pd.DataFrame(common_words, columns=['ngram', 'count'])

bigram_df.groupby('ngram').sum()['count'].sort_values().plot.barh(
    title='Top 8 bigrams', color='orange', width=.4, figsize=(12, 8), stacked=True)
def get_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def get_polarity(text):
    return TextBlob(text).sentiment.polarity

df1['subjectivity'] = df1['renderedContent'].apply(get_subjectivity)
df1['polarity'] = df1['renderedContent'].apply(get_polarity)
df1.head()

df1['textblob_score'] = df1['renderedContent'].apply(lambda x: TextBlob(x).sentiment.polarity)
neutral_threshold = 0.05
# Label tweets: polarity >= 0.05 is Positive, <= -0.05 is Negative, otherwise Neutral
# (label capitalisation normalised; the print mixed 'positive' with 'Negative'/'Neutral')
df1['textblob_sentiment'] = df1['textblob_score'].apply(
    lambda c: 'Positive' if c >= neutral_threshold
    else ('Negative' if c <= -neutral_threshold else 'Neutral'))
textblob_df = df1[['renderedContent', 'textblob_sentiment', 'likeCount']]
textblob_df
textblob_df["textblob_sentiment"].value_counts()
textblob_df["textblob_sentiment"].value_counts().plot.barh(
    title='Sentiment Analysis', color='orange', width=.4, figsize=(12, 8), stacked=True)
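
For reference, TextBlob's .sentiment returns a (polarity, subjectivity) pair; the threshold above is applied to the polarity component, which lies in [-1, 1]. A quick illustration (example sentences made up):

print(TextBlob("The new budget is a great step forward").sentiment)    # clearly positive polarity
print(TextBlob("The budget announcement was on Wednesday").sentiment)  # near-zero polarity -> Neutral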
df_positive = textblob_df[textblob_df['textblob_sentiment'] == 'Positive']
df_very_positive = df_positive[df_positive['likeCount'] > 0]
df_very_positive.head()

df_negative = textblob_df[textblob_df['textblob_sentiment'] == 'Negative']
df_negative
df_neutral = textblob_df[textblob_df['textblob_sentiment'] == 'Neutral']
df_neutral
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

# Creating the text variable
positive_tw = " ".join(t for t in df_very_positive.renderedContent)
# Creating the word cloud, with the text as the argument to .generate()
word_cloud1 = WordCloud(collocations=False, background_color='white').generate(positive_tw)
# Display the generated word cloud
plt.imshow(word_cloud1, interpolation='bilinear')
plt.axis('off')
plt.show()

# Creating the text variable
negative_tw = " ".join(t for t in df_negative.renderedContent)
word_cloud2 = WordCloud(collocations=False, background_color='white').generate(negative_tw)
# Display the generated word cloud
plt.imshow(word_cloud2, interpolation='bilinear')
plt.axis('off')
plt.show()

# Creating the text variable
neutral_tw = " ".join(t for t in df_neutral.renderedContent)
word_cloud3 = WordCloud(collocations=False, background_color='white').generate(neutral_tw)  # the print reused the name word_cloud2
# Display the generated word cloud
plt.imshow(word_cloud3, interpolation='bilinear')
plt.axis('off')
plt.show()
OUTPUT:
