0% found this document useful (0 votes)
10 views

Web_Scrapping.ipynb - Colab

Uploaded by

adityakauthkar
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views

Web_Scrapping.ipynb - Colab

Uploaded by

adityakauthkar
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 7

4/25/24, 8:00 PM Web_Scrapping.

ipynb - Colab

add Code add Text


Open in Colab

from bs4 import BeautifulSoup as bs # import the Beautiful Soup library for HTML manipulation
import requests # The requests module allows you to send HTTP requests using Python.

link='https://www.amazon.in/Realme-Cyber-Black-128GB-Storage/product-reviews/B0915ZNCPR/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=

# Fetch the reviews page. An explicit timeout is passed because requests waits
# indefinitely by default, which would hang the notebook if the server stalls.
page = requests.get(link, timeout=30) # get response of request in object page

page # print status of response object

<Response [200]>

page.content # print content of response object

b'<!doctype html><html lang="en-in" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var


aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n<!-- sp:end-feature:head-start -->\n\n<!-- sp:feature:cs-
optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-
eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch"
href="https://completion.amazon.com">\n<!-- sp:end-feature:cs-optimization -->\n\n<!-- sp:feature:aui-assets -->\n<link
rel="stylesheet" href="https://images-eu.ssl-images-
amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41wZkyTaWoL.css,31Y8m1dzTdL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.c
XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,41CYNGpGlrL.css,01XPHJk60-
L.css,114y0SIP+yL.css,21aPhFy+riL.css,11gneA3MtJL.css,21fecG8pUzL.css,01ulGzBW88L.css,01CFUgsA-
YL.css,31C80IiXalL.css,11qour3ND0L.css,11gKCCKQV+L.css,11061HxnEvL.css,11oHt2HYxnL.css,013RDhw9hoL.css,11JQtnL-
6eL.css,116v6uYvN6L.css,11jtXRmppwL.css,01QrWuRrZ-
L.css,21zuRztKjtL.css,11QyqG8yiqL.css,11K24eOJg4L.css,11F2+OBzLyL.css,01890+Vwk8L.css,11Y05DTEL6L.css,01cbS3UK11L.css,21F85am0yFL
AUIClients/AmazonUI#not-trident" />\n<script>\n(function(d,g,S,E){function F(a){v&&v.tag&&v.tag(p(":","aui",a))}function q(a,b)
{v&&v.count&&v.count("aui:"+a,0===b?0:b||(v.count("aui:"+a)||0)+1)}function m(a){try{return a.test(navigator.userAgent)}catch(b)
{return!1}}function w(a){return"function"===typeof a}function x(a,b,c){a.addEventListener?
a.addEventListener(b,c,!1):a.attachEvent&&a.attachEvent("on"+b,c)}function p(a,b,c,d){b=b&&c?b+a+c:b||c;return d?
p(a,b,d):b}function G(a,b,c){try{Object.defineProperty(a,b,{value:c,writable:!1})}catch(r){a[b]=\nc}return c}function va(a,b,c)
{var d=c=a.length,f=function(){d--||(T.push(b),U||(setTimeout(fa,0),U=!0))};for(f();c--;)ha[a[c]]?f():(A[a[c]]=A[a[c]]||
[]).push(f)}function wa(a,b,c,d,f){var e=g.createElement(a?"script":"link");x(e,"error",d);f&&x(e,"load",f);a?
(e.type="text/javascript",e.async=!0,c&&/AUIClients|images[/]I/.test(b)&&e.setAttribute("crossorigin","anonymous"),e.src=b):
(e.rel="stylesheet",e.href=b);g.getElementsByTagName("head")[0].appendChild(e)}function ia(a,b){return function(c,r){function
f(){wa(b,\nc,e,function(b){V?q("resource_unload"):e?(e=!1,q("resource_retry"),f()):(q("resource_error"),a.log("Asset failed to
load: "+c));b&&b.stopPropagation?b.stopPropagation():d.event&&
(d.event.cancelBubble=!0)},r)}if(ja[c])return!1;ja[c]=!0;q("resource_count");var e=!0;return!f()}}function xa(a,b,c){for(var d=
{name:a,guard:function(c){return b.guardFatal(a,c)},guardTime:function(a){return b.guardTime(a)},logError:function(c,e,d)
{b.logError(c,e,d,a)}},f=[],e=0;e<c.length;e++)H.hasOwnProperty(c[e])&&(f[e]=\nX.hasOwnProperty(c[e])?X[c[e]]
(H[c[e]],d):H[c[e]]);return f}function B(a,b,c,r,f){return function(e,g){function W(){var a=null;r?a=g:w(g)&&
(h.start=z(),a=g.apply(d,xa(e,y,I)),h.end=z());if(b){H[e]=a;a=e;for(ha[a]=!0;(A[a]||[]).length;)A[a].shift()();delete
A[a]}h.done=!0}var y=f||this;w(e)&&(g=e,e=E);b&&(e=e?e.replace(ka,""):"__NONAME__",Y.hasOwnProperty(e)&&y.error(p(",
reregistered by ",p(" by ",e+" already registered",Y[e]),y.attribution),e),Y[e]=y.attribution);for(var I=
[],J=0;J<a.length;J++)I[J]=\na[J].replace(ka,"");var h=C[e||"anon"+ ++ya]=
{depend:I,registered:z(),namespace:y.namespace};e&&za.hasOwnProperty(e);c?
W():va(I,y.guardFatal(e,W),e);return{decorate:function(a){X[e]=y.guardFatal(e,a)}}}}function la(a){return function(){var
b=Array.prototype.slice.call(arguments);return{execute:B(b,!1,a,!1,this),register:B(b,!0,a,!1,this)}}}function Z(a,b){return
function(c,d){d||(d=c,c=E);var f=this.attribution;return function(){u.push(b||{attribution:f,name:c,logLevel:a});var
e=d.apply(this,arguments);\nu.pop();return e}}}function K(a,b){this.load=
{js:ia(this,!0),css:ia(this)};G(this,"namespace",b);G(this,"attribution",a)}function ma(){g.body?n.trigger("a-
bodyBegin"):setTimeout(ma,20)}function D(a,b){a.className=aa(a,b)+" "+b}function aa(a,b){return(" "+a.className+" ").split("
"+b+" ").join(" ").replace(/^ | $/g,"")}function na(a){try{return a()}catch(b){return!1}}function L(){if(M){var a=
{w:d.innerWidth||h.clientWidth,h:d.innerHeight||h.clientHeight};5<Math.abs(a.w-ba.w)||50<a.h-ba.h?
(ba=a,N=4,\n(a=k.mobile||k.tablet?450<a.w&&a.w>a.h:1250<=a.w)?D(h,"a-ws"):h.className=aa(h,"a-ws")):0<N&&(N-
-,oa=setTimeout(L,16))}}function Aa(a){(M=a===E?!M:!!a)&&L()}function Ba(){return M}"use strict";var O=S.now=S.now||function()
{return+new S},z=function(a){return a&&a.now?a.now.bind(a):O}(d.performance),P=z(),za=
{},t=d.AmazonUIPageJS||d.P;if(t&&t.when&&t.register){P=[];for(var
l=g.currentScript;l;l=l.parentElement)l.id&&P.push(l.id);return t.log("A copy of P has already been loaded on this
page.","FATAL",\nP.join(" "))}var v=d.ue;F();F("aui_build_date:3.22.1-2022-05-13");var T=[],Ca=[],U=!1;var fa=function(){for(var
a=setTimeout(fa,0),b=O();Ca.length||T.length;)if(T.shift()(),50<O()-b)return;clearTimeout(a);U=!1};var ha={},A={},ja=
{},V=!1;x(d,"beforeunload",function(){V=!0;setTimeout(function(){V=!1},1E4)});var ka=/^prv:/,Y={},H={},X={},C=
{},ya=0,ca=String.fromCharCode(92),u=[],pa=!0,qa=d.onerror;d.onerror=function(a,b,c,r,f){f&&"object"===typeof f||
(f=Error(a,b,c),f.columnNumber=r,f.stack=b||c||r?\np(ca,f.message,"at "+p(":",b,c,r)):E);var e=u.pop()||
{};f.attribution=p(":",f.attribution||e.attribution,e.name);f.logLevel=e.logLevel;f.attribution&&console&&console.log&&console.lo
by",f.attribution].join(" "));u=[];qa&&(e=[].slice.call(arguments),e[4]=f,qa.apply(d,e))};K.prototype=
{logError:function(a,b,c,r){b={message:b,logLevel:c||"ERROR",attribution:p(":",this.attribution,r)};if(d.ueLogError)return
d.ueLogError(a||b,a?b:null),!0;console&&console.error&&(console.log(b),\nconsole.error(a));return!1},error:function(a,b,c,d)
{a=Error(p(":",d,a,c));a.attribution=p(":",this.attribution,b);throw
a;},guardError:Z(),guardFatal:Z("FATAL"),guardCurrent:function(a){var b=u[u.length-1];return b?
Z(b.logLevel,b).call(this,a):a},guardTime:function(a){var b=u[u.length-1],c=b&&b.name;return c&&c in C?function(){var

soup = bs(page.content,'html.parser') # parse the response body with BeautifulSoup's 'html.parser' and store the result in soup

print(soup.prettify()) # print the formatted HTML code using prettify function

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in">
<!-- sp:feature:head-start -->
<head>

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 1/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
<script>
var aPageStart = (new Date()).getTime();
</script>
<meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<!-- sp:feature:cs-optimization -->
<meta content="on" http-equiv="x-dns-prefetch-control"/>
<link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
<link href="https://m.media-amazon.com" rel="dns-prefetch"/>
<link href="https://completion.amazon.com" rel="dns-prefetch"/>
<!-- sp:end-feature:cs-optimization -->
<!-- sp:feature:aui-assets -->
<link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41wZkyTaWoL.css,31Y8m1dzTdL.css,01
<script>
(function(d,g,S,E){function F(a){v&&v.tag&&v.tag(p(":","aui",a))}function q(a,b){v&&v.count&&v.count("aui:"+a,0===b?0:b||(v.c
c}return c}function va(a,b,c){var d=c=a.length,f=function(){d--||(T.push(b),U||(setTimeout(fa,0),U=!0))};for(f();c--;)ha[a[c]]?f(
c,e,function(b){V?q("resource_unload"):e?(e=!1,q("resource_retry"),f()):(q("resource_error"),a.log("Asset failed to load: "+c));b
X.hasOwnProperty(c[e])?X[c[e]](H[c[e]],d):H[c[e]]);return f}function B(a,b,c,r,f){return function(e,g){function W(){var a=null;r?
a[J].replace(ka,"");var h=C[e||"anon"+ ++ya]={depend:I,registered:z(),namespace:y.namespace};e&&za.hasOwnProperty(e);c?W():va(I,y
u.pop();return e}}}function K(a,b){this.load={js:ia(this,!0),css:ia(this)};G(this,"namespace",b);G(this,"attribution",a)}function
(a=k.mobile||k.tablet?450<a.w&&a.w>a.h:1250<=a.w)?D(h,"a-ws"):h.className=aa(h,"a-ws")):0<N&&(N--,oa=setTimeout(L,16))}}function
P.join(" "))}var v=d.ue;F();F("aui_build_date:3.22.1-2022-05-13");var T=[],Ca=[],U=!1;var fa=function(){for(var a=setTimeout(fa,0
p(ca,f.message,"at "+p(":",b,c,r)):E);var e=u.pop()||{};f.attribution=p(":",f.attribution||e.attribution,e.name);f.logLevel=e.log
console.error(a));return!1},error:function(a,b,c,d){a=Error(p(":",d,a,c));a.attribution=p(":",this.attribution,b);throw a;},guard
!0),execute:B([]),AUI_BUILD_DATE:"3.22.1-2022-05-13",when:la(),now:la(!0),trigger:function(a,b,c){var g=O();this.declare(a,{data:
G(d,"AmazonUIPageJS",new K);var Q=n._namespace("PageJS","AmazonUI");Q.declare("prv:p-debug",C);n.declare("p-recorder-events",[]);
c.charAt(0).toUpperCase()+c.substr(1);c=(a.join(d+" ")+d+" "+c).split(" ");for(d=c.length;d--;)if(""===b.style[c[d]])return!0;ret
"svg").createSVGRect},offline:function(){return navigator.hasOwnProperty&&navigator.hasOwnProperty("onLine")&&navigator.onLine},d
g.createElement("textarea")},localStorage:function(){return"localStorage"in d&&null!==d.localStorage},orientation:function(){retu
ios:function(){return m(/OS [1-9][0-9]*(_[0-9]*)+ like Mac OS X/i)&&!m(/trident|Edge/i)},android:function(){return m(/android.([1
{w:0,h:0},N=4;L();x(d,"resize",function(){clearTimeout(oa);N=4;L()});var ua={getItem:function(a){try{return d.localStorage.getIte
"3.22.1-2022-05-13");n.register("p-detect",function(){return{capabilities:k,localStorage:k.localStorage&&ua,toggleResponsiveGrid:
if(b&&"retail_service_worker_messaging"===b.feature&&b.command&&b.data){var c=b.data;a=d.ue;var e=d.ueLogError;switch(b.command){
a.trigger(c.weblab,c.treatment);break;default:q("sw:unsupported_message_command",1)}}}function c(){e.forEach(function(a){F(a)})}f
mid:d.ue_mid,pty:d.ue_pty,sid:d.ue_sid,spty:d.ue_spty,furl:d.ue_furl})};x(f,"message",b);a("client_messaging_ready");n.when("load
(function(a){var b=a.reg,g=a.unreg;f&&f.getRegistrations?(Q.when("A").execute(function(a){h(a,g,"unregister")}),x(d,"load",functi
(function(b){function q(a,f,d){function e(a,b,c){var e=Array(f.length);~l&&(e[l]={});~m&&(e[m]=c);for(c=0;c<n.length;c++){var g=n
"@c/"),k=c?b.P._namespace(c):b.P,u=!g.lastIndexOf("@c/",0),n=[];a=[];var p=[],v=[],m=-1,l=-1;for(c=0;c<f.length;c++){var h=f[c];"
q:void 0}:e(a,[],function(){})});u&&k.when("mix:@amzn/mix.client-runtime","mix:"+g).execute(function(a,b){a.registerCapabilityMod
function(a){b.Promise=b.Promise||a}),(Array.prototype.includes?P:P.when("a-polyfill")).register("@p/polyfill-is-ready",function()
b.mixCardInitTimeouts[f]=setTimeout(function(){P.log("Client-side initialization timeout","WARN",a)},d)});b.mix_csa_map=b.mix_csa
b.mix_csa_internal_key(a,["producerId"])}catch(d){return P.logError(d,"MIX C005","ERROR",void 0),function(){}}try{return b.mix_cs
"MIX C004","ERROR",a),function(){}}}})(window);
(window.AmazonUIPageJS ? AmazonUIPageJS : P).when('sp.load.js').execute(function() {
(window.AmazonUIPageJS ? AmazonUIPageJS : P).load.js('https://images-eu.ssl-images-amazon.com/images/I/61wzgHm223L.js?AUIClient
(window.AmazonUIPageJS ? AmazonUIPageJS : P).load.js('https://images-eu.ssl-images-amazon.com/images/I/11Y+5x+kkTL._RC|5110husW
(window.AmazonUIPageJS ? AmazonUIPageJS : P).load.js('https://images-eu.ssl-images-amazon.com/images/I/51Vsv+W3nKL.js?AUIClient
});
</script>
<!-- sp:end-feature:aui-assets -->
<!-- sp:feature:nav-inline-css -->
<!-- NAVYAAN CSS -->
<style type="text/css">

names = soup.find_all('span',class_='a-profile-name') # get list of all span tag elements having class 'a-profile-name' using the find_all method

names # print the names list

[<span class="a-profile-name">Vellingkirinathan Kuppusamy</span>,


<span class="a-profile-name">Rajni Kumari</span>,
<span class="a-profile-name">Vellingkirinathan Kuppusamy</span>,
<span class="a-profile-name">KISHORE</span>,
<span class="a-profile-name">Sushma shilpi</span>,
<span class="a-profile-name">S. Jayapaul</span>,
<span class="a-profile-name">shivam kumar</span>,
<span class="a-profile-name">shivam kumar</span>,
<span class="a-profile-name">Rajni Kumari</span>,
<span class="a-profile-name">Kasqk masaka</span>]

# Collect the text of each matched <span> tag into cust_name.
# A comprehension replaces the index-based loop, whose body had lost its
# indentation and would not run as written.
cust_name = [tag.get_text() for tag in names]
cust_name # print the cust_name list

['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'Vellingkirinathan Kuppusamy',
'KISHORE',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 2/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
cust_name.pop(2) # pop out second index element from list i.e 'Vellingkirinathan Kuppusamy'
cust_name # print the cust_name list

['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'KISHORE',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']

cust_name.pop(2) # pop out second index element from list i.e 'KISHORE'
cust_name # print the cust_name list

['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']

cust_name.pop(4) # pop out the element at index 4 from the list, i.e. the first 'shivam kumar'
cust_name # print the cust_name list

['Vellingkirinathan Kuppusamy',
'Rajni Kumari',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']

cust_name.pop(1) # pop out the element at index 1 from the list, i.e. the first 'Rajni Kumari'
cust_name # print the cust_name list

['Vellingkirinathan Kuppusamy',
'Sushma shilpi',
'S. Jayapaul',
'shivam kumar',
'Rajni Kumari',
'Kasqk masaka']

title = soup.find_all('a',class_='review-title-content') # get list of all anchor tag elements having class 'review-title-content' using the find_all method

title # print the title list

[<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"


href="/gp/customer-reviews/R2V00ZREW3NM57?ASIN=B0915ZNCPR">
<span>awesome mobile</span>
</a>,
<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"
href="/gp/customer-reviews/R3PC41N57KP8FA?ASIN=B0915ZNCPR">
<span>4 star</span>
</a>,
<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"
href="/gp/customer-reviews/R34TDA1L2I3AIM?ASIN=B0915ZNCPR">
<span>Go for it it's a good choice</span>
</a>,
<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"
href="/gp/customer-reviews/R2RNAD4ES1NQ6W?ASIN=B0915ZNCPR">
<span>Features best</span>
</a>,
<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"
href="/gp/customer-reviews/R2LSVWJCTHILBY?ASIN=B0915ZNCPR">
<span>Super phone thanks Amazon</span>
</a>,
<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"
href="/gp/customer-reviews/RGVXOZU7SIJ03?ASIN=B0915ZNCPR">
<span>Also you will not get the option of exchange of this phone even now.dont waste your money</span>
</a>,
<a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title"
href="/gp/customer-reviews/R2BGF85DZTU7ZU?ASIN=B0915ZNCPR">
<span>phone phone this phone for everyone</span>
</a>]

# Collect the text of each matched <a> tag into review_title.
# A comprehension replaces the index-based loop, whose body had lost its
# indentation and would not run as written. The surrounding '\n' characters
# are kept here; later cells strip them.
review_title = [tag.get_text() for tag in title]
review_title # print the review_title list

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 3/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab

['\nawesome mobile\n',
'\n4 star\n',
"\nGo for it it's a good choice\n",
'\nFeatures best\n',
'\nSuper phone thanks Amazon\n',
'\nAlso you will not get the option of exchange of this phone even now.dont waste your money\n',
'\nphone phone this phone for everyone\n']

# Drop leading newline characters from every title, updating the list in place.
review_title[:] = [text.lstrip('\n') for text in review_title]
review_title # print the review_title list

['awesome mobile\n',
'4 star\n',
"Go for it it's a good choice\n",
'Features best\n',
'Super phone thanks Amazon\n',
'Also you will not get the option of exchange of this phone even now.dont waste your money\n',
'phone phone this phone for everyone\n']

# Drop trailing newline characters from every title, updating the list in place.
review_title[:] = [text.rstrip('\n') for text in review_title]
review_title # print the review_title list

['awesome mobile',
'4 star',
"Go for it it's a good choice",
'Features best',
'Super phone thanks Amazon',
'Also you will not get the option of exchange of this phone even now.dont waste your money',
'phone phone this phone for everyone']

review_title.pop(1) # pop out first index element from list

'4 star'

len(review_title)

rating = soup.find_all('i',class_='review-rating') # get list of all <i> tag elements having class 'review-rating' using the find_all method
rating # print the rating list

[<i class="a-icon a-icon-star a-star-5 review-rating" data-hook="review-star-rating-view-point"><span class="a-icon-alt">5.0 out of


5 stars</span></i>,
<i class="a-icon a-icon-star a-star-1 review-rating" data-hook="review-star-rating-view-point"><span class="a-icon-alt">1.0 out of
5 stars</span></i>,
<i class="a-icon a-icon-star a-star-5 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">5.0 out of 5
stars</span></i>,
<i class="a-icon a-icon-star a-star-4 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">4.0 out of 5
stars</span></i>,
<i class="a-icon a-icon-star a-star-5 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">5.0 out of 5
stars</span></i>,
<i class="a-icon a-icon-star a-star-4 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">4.0 out of 5
stars</span></i>,
<i class="a-icon a-icon-star a-star-5 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">5.0 out of 5
stars</span></i>,
<i class="a-icon a-icon-star a-star-1 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">1.0 out of 5
stars</span></i>,
<i class="a-icon a-icon-star a-star-5 review-rating" data-hook="review-star-rating"><span class="a-icon-alt">5.0 out of 5
stars</span></i>]

# Collect the text of each matched <i> tag (e.g. '5.0 out of 5 stars') into rate.
# A comprehension replaces the index-based loop, whose body had lost its
# indentation and would not run as written.
rate = [tag.get_text() for tag in rating]
rate # print the rate list

['5.0 out of 5 stars',


'1.0 out of 5 stars',
'5.0 out of 5 stars',
'4.0 out of 5 stars',
'5.0 out of 5 stars',
'4.0 out of 5 stars',
'5.0 out of 5 stars',
'1.0 out of 5 stars',
'5.0 out of 5 stars']

len(rate) # print the length/size of rate list

rate.pop(4) # pop out 4th indexed item from rate list

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 4/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab

'5.0 out of 5 stars'

rate.pop(0) # pop out 0th indexed item from rate list

'5.0 out of 5 stars'

rate.pop(0) # pop out 0th indexed item from rate list

'1.0 out of 5 stars'

rate # print the rate list

['5.0 out of 5 stars',


'4.0 out of 5 stars',
'4.0 out of 5 stars',
'5.0 out of 5 stars',
'1.0 out of 5 stars',
'5.0 out of 5 stars']

review = soup.find_all("span",{"data-hook":"review-body"}) # get list of all span tag elements whose 'data-hook' attribute is 'review-body' using the find_all method
review # print the review list

[<span class="a-size-base review-text review-text-content" data-hook="review-body">


<span>screen battery processing speed style all are excellent. but photos colour segment highly changed comparing with real one.
if you ok with edited style images its a perfect cam for you else maybe you get little angry. but cam quality excellent.. I think
its a best mobile.!</span>
</span>,
<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>4 star</span>
</span>,
<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent
with all the other features</span>
</span>,
<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>Super</span>
</span>,
<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>Super phone thanks Amazon</span>
</span>,
<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor
brraleme 6 support 60fps in bgmi . Absolutely waste of money.</span>
</span>,
<span class="a-size-base review-text review-text-content" data-hook="review-body">
<span>very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone
need to buy this</span>
</span>]

# Collect the text of each matched review-body <span> into review_content.
# A comprehension replaces the index-based loop, whose body had lost its
# indentation and would not run as written. The surrounding '\n' characters
# are kept here; later cells strip them.
review_content = [tag.get_text() for tag in review]
review_content # print the review_content list

['\nscreen battery processing speed style all are excellent. but photos colour segment highly changed comparing with real one. if
you ok with edited style images its a perfect cam for you else maybe you get little angry. but cam quality excellent.. I think its
a best mobile.!\n',
'\n4 star\n',
'\nPersonally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features\n',
'\nSuper\n',
'\nSuper phone thanks Amazon\n',
'\nPlz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor
brraleme 6 support 60fps in bgmi . Absolutely waste of money.\n',
'\nvery good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this\n']

# Drop leading newline characters from every review body, updating the list in place.
review_content[:] = [body.lstrip('\n') for body in review_content]
review_content # print the review_content list

['screen battery processing speed style all are excellent. but photos colour segment highly changed comparing with real one. if you
ok with edited style images its a perfect cam for you else maybe you get little angry. but cam quality excellent.. I think its a
best mobile.!\n',
'4 star\n',
'Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features\n',
'Super\n',
'Super phone thanks Amazon\n',
'Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor brraleme
6 support 60fps in bgmi . Absolutely waste of money.\n',
'very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this\n']

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 5/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab
# NOTE(review): the title dropped earlier was index 1 ('4 star'), but this drops
# index 0; the final table then pairs 'awesome mobile' with the '4 star' body.
# Confirm whether pop(1) was intended.
review_content.pop(0) # pop out 0th indexed item from review_content list

'screen battery processing speed style all are excellent. but photos colour segment
highly changed comparing with real one. if you ok with edited style images its a per
fect cam for you else maybe you get little angry. but cam quality excellent.. I thin

# Drop trailing newline characters from every review body, updating the list in place.
review_content[:] = [body.rstrip('\n') for body in review_content]
review_content # print the review_content list

['4 star',
'Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features',
'Super',
'Super phone thanks Amazon',
'Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor brraleme
6 support 60fps in bgmi . Absolutely waste of money.',
'very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this']

len(review_content) # print the length/size of review_content items

# Evaluate the cust_name, review_title, rate and review_content lists.
# Only the last bare expression (review_content) is echoed in the cell output below.


cust_name
review_title
rate
review_content

['4 star',
'Personally I like light weight phone as it is convenient and this phone fits me better and also battery life is excellent with
all the other features',
'Super',
'Super phone thanks Amazon',
'Plz provide exchange option of this phone while buying other phone. It supports only 40fps gaming while its predecessor brraleme
6 support 60fps in bgmi . Absolutely waste of money.',
'very good phone for everyone.it is very useful battery life is also so good . never seen such a phone in my life .everyone need
to buy this']

import pandas as pd # import pandas library

df = pd.DataFrame() # create a dataframe df

df # print df

df['Customer Name']=cust_name # add column 'Customer Name' to df using cust_name list

df # print df

Customer Name

0 Vellingkirinathan Kuppusamy

1 Sushma shilpi

2 S. Jayapaul

3 shivam kumar

4 Rajni Kumari

5 Kasqk masaka

print(len(review_content))
print(len(review_title))

6
7

# similarly add columns 'Review title', 'Ratings', 'Reviews' to df using review_title, rate, review_content lists respectively
# NOTE(review): each list presumably has to match df's row count; the two lengths printed above differ (6 vs 7) — confirm before assigning
df['Review title']=review_title
df['Ratings']=rate
df['Reviews']=review_content

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 6/7
4/25/24, 8:00 PM Web_Scrapping.ipynb - Colab

df

Customer Name Review title Ratings Reviews

Vellingkirinathan 5.0 out of 5


0 awesome mobile 4 star
Kuppusamy stars

4.0 out of 5 Personally I like light weight


1 Sushma shilpi Go for it it's a good choice
stars phone as it is ...

4.0 out of 5
2 S. Jayapaul Features best Super
stars

Super phone thanks 5.0 out of 5


3 shivam kumar Super phone thanks Amazon
Amazon stars

df.to_csv('reviews.csv',index=True) # save data to a file named 'reviews.csv'

https://colab.research.google.com/drive/1evqB1D-u3kPKwZaGBKvHeY-QDUn29fqK#printMode=true 7/7

You might also like