0% found this document useful (0 votes)
10 views1 page

Se 3 Tal 5 Ees

The document contains code snippets for preprocessing text data in tweets including functions for counting words, characters, hashtags, numerics, uppercase letters as well as removing punctuation, stopwords, frequent and rare words, stemming, lemmatization and more.

Uploaded by

Mohamed Aymen
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views1 page

Se 3 Tal 5 Ees

The document contains code snippets for preprocessing text data in tweets including functions for counting words, characters, hashtags, numerics, uppercase letters as well as removing punctuation, stopwords, frequent and rare words, stemming, lemmatization and more.

Uploaded by

Mohamed Aymen
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 1

# --- Per-tweet feature / cleaning recipes ------------------------------------
# Each helper takes a pandas Series of tweet text (e.g. df['tweet']) and
# returns a new Series. Reconstructed from a column-garbled cheat sheet.

def num_of_words(s):
    """Number of space-separated tokens per entry."""
    return s.apply(lambda x: len(str(x).split(" ")))

def num_of_chars(s):
    """Character count per entry."""
    return s.str.len()

def avg_word_length(s):
    """Mean word length per entry (uses the module-level avg_word helper)."""
    return s.apply(lambda x: avg_word(x))

def stop_words(s):
    """Count of stopwords per entry (uses the module-level `stop` list)."""
    return s.apply(lambda x: len([w for w in x.split() if w in stop]))

def hash_tags(s):
    """Count of '#'-prefixed tokens per entry."""
    return s.apply(lambda x: len([w for w in x.split() if w.startswith('#')]))

def num_numerics(s):
    """Count of purely-numeric tokens per entry."""
    return s.apply(lambda x: len([w for w in x.split() if w.isdigit()]))

def num_uppercase(s):
    """Count of fully-uppercase tokens per entry."""
    return s.apply(lambda x: len([w for w in x.split() if w.isupper()]))

def lower_case(s):
    """Lowercase every token per entry."""
    return s.apply(lambda x: " ".join(w.lower() for w in x.split()))

def punctuation_removal(s):
    """Strip punctuation characters.

    regex=True is required on pandas >= 2.0, where str.replace stopped
    treating the pattern as a regex by default.
    """
    return s.str.replace(r'[^\w\s]', '', regex=True)

def stop_words_removal(s):
    """Drop stopwords (module-level `stop`) from every entry."""
    return s.apply(lambda x: " ".join(w for w in x.split() if w not in stop))

def frequent_words_removal(s):
    """Drop words in the module-level `freq` list from every entry."""
    return s.apply(lambda x: " ".join(w for w in x.split() if w not in freq))

def rare_words_removal(s):
    """Drop rare words from every entry.

    NOTE(review): the source filters against the same `freq` name here —
    presumably rebuilt from the rarest words before this is called; confirm.
    """
    return s.apply(lambda x: " ".join(w for w in x.split() if w not in freq))
def spell_correction(df):
    """Spell-correct the first five tweets with TextBlob.

    Returns a Series of corrected strings. Limited to [:5] because
    TextBlob.correct() is slow. Requires the module-level TextBlob import.
    """
    return df['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))
def tokens(df):
    """Tokenize one tweet with TextBlob.

    Demo on the row at index 1 only; returns a TextBlob WordList.
    """
    sample = df['tweet'][1]
    return TextBlob(sample).words
def stemming(df):
    """Porter-stem every word of the first five tweets.

    Uses the module-level `st` (PorterStemmer); returns a Series of
    space-joined stemmed strings.
    """
    first_five = df['tweet'][:5]
    return first_five.apply(
        lambda tweet: " ".join(st.stem(w) for w in tweet.split())
    )
# More Series-level recipes, reconstructed from garbled fragments.

def lemmatization(s):
    """Lemmatize every word per entry.

    Requires `from textblob import Word` beforehand (original note, translated
    from Arabic: "before this").
    """
    return s.apply(lambda x: " ".join(Word(w).lemmatize() for w in x.split()))

def upper_case(s):
    """Uppercase every token per entry."""
    return s.apply(lambda x: " ".join(w.upper() for w in x.split()))

def lower_case(s):
    """Lowercase every token per entry (the sheet repeats this recipe)."""
    return s.apply(lambda x: " ".join(w.lower() for w in x.split()))

# Two DataFrame-level demo functions; the PDF extraction fused their defs and
# truncated both bodies (e.g. "df['word_count'] = df['tweet']" lost its
# .apply). Reconstructed from the recipe table earlier in the sheet.

def num_of_words(df):
    """Add a word_count column to df and print a preview.

    Mutates df in place; counts space-separated tokens of df['tweet'].
    """
    df['word_count'] = df['tweet'].apply(lambda x: len(str(x).split(" ")))
    print(df['tweet'].head())
    print(df[['tweet', 'word_count']].head())

def upper_case(df):
    """Uppercase every token of df['tweet'] in place and print a preview."""
    df['tweet'] = df['tweet'].apply(lambda x: " ".join(w.upper() for w in x.split()))
    print(df['tweet'].head())

def avg_word(sentence):
    """Return the average word length of *sentence*.

    Words are whitespace-separated tokens. Returns 0.0 for an empty or
    whitespace-only sentence instead of raising ZeroDivisionError (the
    original divided by len(words) unconditionally).
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Module-level setup, reconstructed from two-column garble. Arabic margin
# notes ("before X") translated into the comments below.
import nltk                              # needed before the stopword list
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob, Word      # needed before spell_correction / lemmatization

stop = stopwords.words('english')        # stopword list used by stop_words*
st = PorterStemmer()                     # stemmer used by stemming

# Build the word lists consumed by frequent_words_removal / rare_words_removal.
# NOTE(review): both stages rebind the same `freq` name, exactly as in the
# source sheet — only one list is live at a time, so run the matching removal
# step immediately after the stage that builds its list.

# Stage 1: ten most frequent words across the training tweets.
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
freq = list(freq.index)

# Stage 2: ten rarest words (overwrites the list above).
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq = list(freq.index)

You might also like