Web_Scrapping.ipynb - Colab
Web_Scrapping.ipynb - Colab
ipynb - Colab
from bs4 import BeautifulSoup as bs # import the Beautiful Soup library for HTML parsing and navigation
import requests # the requests module allows you to send HTTP requests using Python
link='https://www.amazon.in/Realme-Cyber-Black-128GB-Storage/product-reviews/B0915ZNCPR/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=
<Response [200]>
soup = bs(page.content,'html.parser') # parse the content of the page object with BeautifulSoup's 'html.parser' and store the result in soup
<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in">
<!-- sp:feature:head-start -->
<head>
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 1/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
<script>
var aPageStart = (new Date()).getTime();
</script>
<meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<!-- sp:feature:cs-optimization -->
<meta content="on" http-equiv="x-dns-prefetch-control"/>
<link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
<link href="https://m.media-amazon.com" rel="dns-prefetch"/>
<link href="https://completion.amazon.com" rel="dns-prefetch"/>
<!-- sp:end-feature:cs-optimization -->
<!-- sp:feature:aui-assets -->
<link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41wZkyTaWoL.css,31Y8m1dzTdL.css,01
<script>
(function(d,g,S,E){function F(a){v&&v.tag&&v.tag(p(":","aui",a))}function q(a,b){v&&v.count&&v.count("aui:"+a,0===b?0:b||(v.c
c}return c}function va(a,b,c){var d=c=a.length,f=function(){d--||(T.push(b),U||(setTimeout(fa,0),U=!0))};for(f();c--;)ha[a[c]]?f(
c,e,function(b){V?q("resource_unload"):e?(e=!1,q("resource_retry"),f()):(q("resource_error"),a.log("Asset failed to load: "+c));b
X.hasOwnProperty(c[e])?X[c[e]](H[c[e]],d):H[c[e]]);return f}function B(a,b,c,r,f){return function(e,g){function W(){var a=null;r?
a[J].replace(ka,"");var h=C[e||"anon"+ ++ya]={depend:I,registered:z(),namespace:y.namespace};e&&za.hasOwnProperty(e);c?W():va(I,y
u.pop();return e}}}function K(a,b){this.load={js:ia(this,!0),css:ia(this)};G(this,"namespace",b);G(this,"attribution",a)}function
(a=k.mobile||k.tablet?450<a.w&&a.w>a.h:1250<=a.w)?D(h,"a-ws"):h.className=aa(h,"a-ws")):0<N&&(N--,oa=setTimeout(L,16))}}function
P.join(" "))}var v=d.ue;F();F("aui_build_date:3.22.1-2022-05-13");var T=[],Ca=[],U=!1;var fa=function(){for(var a=setTimeout(fa,0
p(ca,f.message,"at "+p(":",b,c,r)):E);var e=u.pop()||{};f.attribution=p(":",f.attribution||e.attribution,e.name);f.logLevel=e.log
console.error(a));return!1},error:function(a,b,c,d){a=Error(p(":",d,a,c));a.attribution=p(":",this.attribution,b);throw a;},guard
!0),execute:B([]),AUI_BUILD_DATE:"3.22.1-2022-05-13",when:la(),now:la(!0),trigger:function(a,b,c){var g=O();this.declare(a,{data:
G(d,"AmazonUIPageJS",new K);var Q=n._namespace("PageJS","AmazonUI");Q.declare("prv:p-debug",C);n.declare("p-recorder-events",[]);
c.charAt(0).toUpperCase()+c.substr(1);c=(a.join(d+" ")+d+" "+c).split(" ");for(d=c.length;d--;)if(""===b.style[c[d]])return!0;ret
"svg").createSVGRect},offline:function(){return navigator.hasOwnProperty&&navigator.hasOwnProperty("onLine")&&navigator.onLine},d
g.createElement("textarea")},localStorage:function(){return"localStorage"in d&&null!==d.localStorage},orientation:function(){retu
ios:function(){return m(/OS [1-9][0-9]*(_[0-9]*)+ like Mac OS X/i)&&!m(/trident|Edge/i)},android:function(){return m(/android.([1
{w:0,h:0},N=4;L();x(d,"resize",function(){clearTimeout(oa);N=4;L()});var ua={getItem:function(a){try{return d.localStorage.getIte
"3.22.1-2022-05-13");n.register("p-detect",function(){return{capabilities:k,localStorage:k.localStorage&&ua,toggleResponsiveGrid:
if(b&&"retail_service_worker_messaging"===b.feature&&b.command&&b.data){var c=b.data;a=d.ue;var e=d.ueLogError;switch(b.command){
a.trigger(c.weblab,c.treatment);break;default:q("sw:unsupported_message_command",1)}}}function c(){e.forEach(function(a){F(a)})}f
mid:d.ue_mid,pty:d.ue_pty,sid:d.ue_sid,spty:d.ue_spty,furl:d.ue_furl})};x(f,"message",b);a("client_messaging_ready");n.when("load
(function(a){var b=a.reg,g=a.unreg;f&&f.getRegistrations?(Q.when("A").execute(function(a){h(a,g,"unregister")}),x(d,"load",functi
(function(b){function q(a,f,d){function e(a,b,c){var e=Array(f.length);~l&&(e[l]={});~m&&(e[m]=c);for(c=0;c<n.length;c++){var g=n
"@c/"),k=c?b.P._namespace(c):b.P,u=!g.lastIndexOf("@c/",0),n=[];a=[];var p=[],v=[],m=-1,l=-1;for(c=0;c<f.length;c++){var h=f[c];"
q:void 0}:e(a,[],function(){})});u&&k.when("mix:@amzn/mix.client-runtime","mix:"+g).execute(function(a,b){a.registerCapabilityMod
function(a){b.Promise=b.Promise||a}),(Array.prototype.includes?P:P.when("a-polyfill")).register("@p/polyfill-is-ready",function()
b.mixCardInitTimeouts[f]=setTimeout(function(){P.log("Client-side initialization timeout","WARN",a)},d)});b.mix_csa_map=b.mix_csa
b.mix_csa_internal_key(a,["producerId"])}catch(d){return P.logError(d,"MIX C005","ERROR",void 0),function(){}}try{return b.mix_cs
"MIX C004","ERROR",a),function(){}}}})(window);
(window.AmazonUIPageJS ? AmazonUIPageJS : P).when('sp.load.js').execute(function() {
(window.AmazonUIPageJS ? AmazonUIPageJS : P).load.js('https://images-eu.ssl-images-amazon.com/images/I/61wzgHm223L.js?AUIClient
(window.AmazonUIPageJS ? AmazonUIPageJS : P).load.js('https://images-eu.ssl-images-amazon.com/images/I/11Y+5x+kkTL._RC|5110husW
(window.AmazonUIPageJS ? AmazonUIPageJS : P).load.js('https://images-eu.ssl-images-amazon.com/images/I/51Vsv+W3nKL.js?AUIClient
});
</script>
<!-- sp:end-feature:aui-assets -->
<!-- sp:feature:nav-inline-css -->
<!-- NAVYAAN CSS -->
<style type="text/css">
names = soup.find_all('span',class_='a-profile-name') # get list of all span tag elements having class 'a-profile-name' using the find_all method
['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'Vellingkirinathan Kuppusamy',
'KISHORE',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 2/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
cust_name.pop(2) # remove the item at index 2 (the third item), i.e. the duplicate 'Vellingkirinathan Kuppusamy'
cust_name # display the cust_name list
['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'KISHORE',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']
cust_name.pop(2) # remove the item at index 2 (the third item), i.e. 'KISHORE'
cust_name # display the cust_name list
['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']
cust_name.pop(4) # remove the item at index 4 (the fifth item), i.e. the first of the two 'shivam kumar' entries
cust_name # display the cust_name list
['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']
cust_name.pop(1) # remove the item at index 1 (the second item), i.e. the first 'Rajni Kumari' entry
cust_name # display the cust_name list
['Vellingkirinathan Kuppusamy',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']
title = soup.find_all('a',class_='review-title-content') # get list of all anchor tag elements having class 'review-title-content' using the find_all method
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 3/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
['\nawesome mobile\n',
'\n4 star\n',
"\nGo for it it's a good choice\n",
'\nFeatures best\n',
'\nSuper phone thanks Amazon\n',
'\nAlso you will not get the option of exchange of this phone even now.dont waste your money\n',
'\nphone phone this phone for everyone\n']
review_title[:] = [titles.lstrip('\n') for titles in review_title] # strip leading '\n' characters from every item; the slice assignment updates the existing list in place
review_title # display the review_title list
['awesome mobile\n',
'4 star\n',
"Go for it it's a good choice\n",
'Features best\n',
'Super phone thanks Amazon\n',
'Also you will not get the option of exchange of this phone even now.dont waste your money\n',
'phone phone this phone for everyone\n']
review_title[:] = [titles.rstrip('\n') for titles in review_title] # strip trailing '\n' characters from every item; the slice assignment updates the existing list in place
review_title # display the review_title list
['awesome mobile',
'4 star',
"Go for it it's a good choice",
'Features best',
'Super phone thanks Amazon',
'Also you will not get the option of exchange of this phone even now.dont waste your money',
'phone phone this phone for everyone']
'4 star'
len(review_title)
rating = soup.find_all('i',class_='review-rating') # get list of all <i> tag elements having class 'review-rating' using the find_all method
rating # display the rating list
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 4/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
review = soup.find_all("span",{"data-hook":"review-body"}) # get list of all span tag elements whose data-hook attribute is 'review-body' (an attribute filter, not a class filter) using the find_all method
review # display the review list
['\nscreen battery processing speed style all are excellent. but photos colour segment highly changed comparing with real one. if
you ok with edited style images its a perfect cam for you else maybe you get little angry. but cam quality excellent.. I think its
a best mobile.!\n',
'\n4 star\n',
'\nPersonally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features\n',
'\nSuper\n',
'\nSuper phone thanks Amazon\n',
'\nPlz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor
brraleme 6 support 60fps in bgmi . Absolutely waste of money.\n',
'\nvery good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this\n']
review_content[:] = [reviews.lstrip('\n') for reviews in review_content] # strip leading '\n' characters from every item; the slice assignment updates the existing list in place
review_content # display the review_content list
['screen battery processing speed style all are excellent. but photos colour segment highly changed comparing with real one. if you
ok with edited style images its a perfect cam for you else maybe you get little angry. but cam quality excellent.. I think its a
best mobile.!\n',
'4 star\n',
'Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features\n',
'Super\n',
'Super phone thanks Amazon\n',
'Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor brraleme
6 support 60fps in bgmi . Absolutely waste of money.\n',
'very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this\n']
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 5/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
review_content.pop(0) # remove and return the item at index 0 (the first review) from the review_content list
'screen battery processing speed style all are excellent. but photos colour segment
highly changed comparing with real one. if you ok with edited style images its a per
fect cam for you else maybe you get little angry. but cam quality excellent.. I thin
review_content[:] = [reviews.rstrip('\n') for reviews in review_content] # strip trailing '\n' characters from every item; the slice assignment updates the existing list in place
review_content # display the review_content list
['4 star',
'Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features',
'Super',
'Super phone thanks Amazon',
'Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor brraleme
6 support 60fps in bgmi . Absolutely waste of money.',
'very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this']
['4 star',
'Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features',
'Super',
'Super phone thanks Amazon',
'Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor brraleme
6 support 60fps in bgmi . Absolutely waste of money.',
'very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this']
df # print df
df # print df
Customer Name
0 Vellingkirinathan Kuppusamy
1 Sushma shilpi
2 S. Jayapaul
3 shivam kumar
4 Rajni Kumari
5 Kasqk masaka
print(len(review_content))
print(len(review_title))
6
7
# similarly add columns 'Review title', 'Ratings', 'Reviews' to df using the review_title, rate, review_content lists respectively
# NOTE(review): assuming df is a pandas DataFrame, each list must have exactly as many items as df has rows;
# the lengths printed just before this cell are 6 and 7 — confirm review_title has been trimmed to match first
df['Review title']=review_title
df['Ratings']=rate
df['Reviews']=review_content
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 6/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
df
4.0 out of 5
2 S. Jayapaul Features best Super
stars
https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 7/7