nlp2.ipynb - Colab

import nltk
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
True

# Sample data (spelling errors such as "allowded" are left as-is;
# they show up verbatim in the learned vocabulary below)
corpus = [
    "SPPU is the one of the best university in India.",
    "India has already allowded so many new universities.",
    "AICTE is main authority in technical education.",
    "UGC and AICTE allowded technical education in india?",
]

# Create the Bag of Words model
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(corpus)

# Get feature names and transformed data
feature_names_bow = vectorizer.get_feature_names_out()
bow_matrix = X_bow.toarray()

# Print feature names and BoW matrix
print("Feature Names (BoW):", feature_names_bow)
print("BoW Matrix:\n", bow_matrix)

Feature Names (BoW): ['aicte' 'allowded' 'already' 'and' 'authority' 'best' 'education' 'has'
'in' 'india' 'is' 'main' 'many' 'new' 'of' 'one' 'so' 'sppu' 'technical'
'the' 'ugc' 'universities' 'university']
BoW Matrix:
[[0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 0 2 0 0 1]
[0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0]
[1 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0]
[1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0]]
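Each row of the BoW matrix is one document and each column counts one vocabulary term; for example, "the" occurs twice in the first sentence, which produces the 2 in the first row. A fitted vectorizer can also encode unseen text against the same vocabulary; a minimal sketch (the new sentence is illustrative, not part of the notebook):

# Encode a new sentence with the vocabulary learned above;
# out-of-vocabulary tokens (e.g. "naac") are silently dropped.
new_doc = ["NAAC and UGC are in India."]
print(vectorizer.transform(new_doc).toarray())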

!pip install scikit-learn gensim nltk
from sklearn.feature_extraction.text import TfidfVectorizer

Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)
Requirement already satisfied: gensim in /usr/local/lib/python3.10/dist-packages (4.3.2)
Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.4.0)
Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim) (6.4.0)
Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.12.25)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.2)

# Create the TF-IDF model
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(corpus)

# Get feature names and transformed data
feature_names_tfidf = vectorizer_tfidf.get_feature_names_out()
tfidf_matrix = X_tfidf.toarray()

# Print feature names and TF-IDF matrix
print("Feature Names (TF-IDF):", feature_names_tfidf)
print("TF-IDF Matrix:\n", tfidf_matrix)

Feature Names (TF-IDF): ['aicte' 'allowded' 'already' 'and' 'authority' 'best' 'education' 'has'
'in' 'india' 'is' 'main' 'many' 'new' 'of' 'one' 'so' 'sppu' 'technical'
'the' 'ugc' 'universities' 'university']
TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.30954541
  0.         0.         0.19757882 0.19757882 0.24404915 0.
  0.         0.         0.30954541 0.30954541 0.         0.30954541
  0.         0.61909081 0.         0.         0.30954541]
 [0.         0.29737611 0.37718389 0.         0.         0.
  0.         0.37718389 0.         0.24075159 0.         0.
  0.37718389 0.37718389 0.         0.         0.37718389 0.
  0.         0.         0.         0.37718389 0.        ]
 [0.35639424 0.         0.         0.         0.4520409  0.
  0.35639424 0.         0.28853185 0.         0.35639424 0.4520409
  0.         0.         0.         0.         0.         0.
  0.35639424 0.         0.         0.         0.        ]
 [0.34242558 0.34242558 0.         0.43432343 0.         0.
  0.34242558 0.         0.27722302 0.27722302 0.         0.
  0.         0.         0.         0.         0.         0.
  0.34242558 0.         0.43432343 0.         0.        ]]
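TfidfVectorizer L2-normalizes each document row by default, so the dot product of two rows is already their cosine similarity. A minimal sketch of comparing the four documents (the cosine_similarity import is an addition, not in the original notebook):

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise document similarity; since rows are unit-length,
# this equals X_tfidf @ X_tfidf.T
print(cosine_similarity(X_tfidf).round(2))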

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenize the documents
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
# Train the Word2Vec model
model_w2v = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
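The trained model's wv attribute maps every token to a 100-dimensional vector and supports similarity queries. A quick check (neighbours will vary from run to run, since the corpus is tiny and the weights are randomly initialized):

# Nearest neighbours of 'india' in the learned embedding space
print(model_w2v.wv.most_similar('india', topn=3))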

# Get the Word2Vec embeddings for each word
embeddings_w2v = [model_w2v.wv[word] for doc in tokenized_corpus for word in doc]

print("Word2Vec Embeddings (Example):", embeddings_w2v[:5])

Word2Vec Embeddings (Example): [array([-4.9724146e-03, -1.2821439e-03,  3.2808294e-03, ...],
dtype=float32), array([-0.0071398 ,  0.00124439, -0.00717616, ...], dtype=float32), ...]
(five 100-dimensional float32 vectors; full printout truncated)
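A flat list of per-word vectors like this is rarely used directly; a common next step is to average each document's word vectors into a single fixed-length document embedding. A minimal sketch (the helper name doc_vector is ours, not from the notebook):

import numpy as np

def doc_vector(tokens, model):
    # Mean of the word vectors; min_count=1 above guarantees
    # every corpus token has an embedding
    return np.mean([model.wv[w] for w in tokens], axis=0)

doc_embeddings = np.array([doc_vector(doc, model_w2v) for doc in tokenized_corpus])
print(doc_embeddings.shape)  # (4, 100)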
