Dropdown List Scraping

The document contains two Scrapy spider implementations for scraping product details from the Lucky Pet website. Both extract product titles, links, images, stock status, prices, descriptions, brand names, and life stages, and both follow pagination to scrape multiple pages of products. The second version adds Selenium for dynamic content loading so that price variations driven by a size dropdown can be captured.
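Either spider can be launched outside a full Scrapy project via CrawlerProcess. The following is a minimal sketch, assuming Scrapy is installed and the spider class below is defined in (or imported into) the same file; "products.json" is an illustrative output name, not from the original document:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Export scraped items to JSON ("products.json" is an illustrative name)
    "FEEDS": {"products.json": {"format": "json"}},
})
process.crawl(LuckypetSpiderSpider)
process.start()  # blocks until the crawl finishes

The first, Selenium-free version of the spider follows.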


import scrapy
import re


class LuckypetSpiderSpider(scrapy.Spider):
    name = "luckypet_spider"
    allowed_domains = ["www.luckypet.com.au"]
    start_urls = ["https://www.luckypet.com.au/dog-shop/food-treats.html"]

    def parse(self, response):
        products = response.css('div.card.thumbnail.card-body')

        for product in products:
            Product_Title = product.css('p.card-title::text').get()

            # Skip this product if the title is missing
            if Product_Title is None:
                continue

            title_Link = product.css('a.thumbnail-image::attr(href)').get()

            # Get image URL and convert to absolute URL
            Image_url = product.css('img.product-image::attr(data-src)').get()
            if Image_url:
                Image_url = response.urljoin(Image_url)

            Stock_Status = product.css('span.badge::text').get()

            # Create item dictionary to store data
            item = {
                'Product_Title': Product_Title,
                'Title_Link': title_Link,
                'Image_url': Image_url,
                'Stock_Status': Stock_Status
            }

            # Follow the product link to get additional details
            yield scrapy.Request(
                url=response.urljoin(title_Link),  # urljoin in case the href is relative
                callback=self.parse_product_details,
                meta={'item': item}
            )

        # Check for pagination and follow the next page
        next_page = response.css('a.page-link[aria-label="Go forward one page"]::attr(href)').get()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the item from the request meta
        item = response.meta['item']

        # Extract Regular Price
        price = response.css('span.txt-pur.large-price.font-weight-bold::text').get()
        if price:
            item['Regular_Price'] = price.strip()
        else:
            item['Regular_Price'] = "Not Available"

        # Extract Description and clean it
        description_div = response.css('div.productdetails.n-responsive-content')
        if description_div:
            # Extract all text nodes from the description section
            description_texts = description_div.xpath('.//text()').getall()
            # Join all text and remove extra whitespace
            description = ' '.join([text.strip() for text in description_texts if text.strip()])
            # Collapse multiple spaces
            description = re.sub(r'\s+', ' ', description)
            item['Description'] = description.strip()
        else:
            item['Description'] = "Not Available"

        # Extract Brand Name
        brand_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Brand")]]/td[2]/a/text()'
        )
        if brand_row:
            item['Brand_name'] = brand_row.get().strip()
        else:
            item['Brand_name'] = "Not Available"

        # Extract Life Stage
        lifestage_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Life Stage")]]/td[2]/text()'
        )
        if lifestage_row:
            item['Lifestage'] = lifestage_row.get().strip()
        else:
            item['Lifestage'] = "Not Available"

        # Return the final item
        yield item
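As a quick illustration of the Brand and Life Stage XPaths above, here is how they behave against a hypothetical spec-table fragment (the HTML below is invented to mirror the structure the spider expects, not copied from the live site); parsel is the selector library Scrapy itself uses:

from parsel import Selector

html = '''
<table class="table">
  <tr><td><strong>Brand</strong></td><td><a href="/brands/acme">Acme Pet Co</a></td></tr>
  <tr><td><strong>Life Stage</strong></td><td>Adult</td></tr>
</table>
'''
sel = Selector(text=html)
print(sel.xpath('//table[@class="table"]//tr[td[1]/strong[contains(text(), "Brand")]]/td[2]/a/text()').get())      # Acme Pet Co
print(sel.xpath('//table[@class="table"]//tr[td[1]/strong[contains(text(), "Life Stage")]]/td[2]/text()').get())   # Adult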
A second version of the same spider drives a real browser through Selenium so that the price shown for each size in the product dropdown can be captured:

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re


class LuckypetSpiderSpider(scrapy.Spider):
    name = "luckypet_spider"
    allowed_domains = ["www.luckypet.com.au"]
    start_urls = ["https://www.luckypet.com.au/dog-shop/food-treats.html"]

    # Throttle the crawl: one request at a time with a randomized 3-second delay
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'COOKIES_ENABLED': True,
        'CONCURRENT_REQUESTS': 1
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Local chromedriver binary (here the copy downloaded by webdriver-manager); adjust the path for your machine
        service = Service(r"C:\Users\Hp\.wdm\drivers\chromedriver\win64\chromedriver-win64\chromedriver.exe")
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        self.driver.set_window_size(1920, 1080)

    def parse(self, response):
        products = response.css('div.card.thumbnail.card-body')

        for product in products:
            Product_Title = product.css('p.card-title::text').get()

            if Product_Title is None:
                continue

            title_Link = product.css('a.thumbnail-image::attr(href)').get()

            Image_url = product.css('img.product-image::attr(data-src)').get()
            if Image_url:
                Image_url = response.urljoin(Image_url)

            Stock_Status = product.css('span.badge::text').get()
            item = {
                'Product_Title': Product_Title,
                'Title_Link': title_Link,
                'Image_url': Image_url,
                'Stock_Status': Stock_Status
            }

            yield scrapy.Request(
                url=response.urljoin(title_Link),  # urljoin in case the href is relative
                callback=self.parse_product_details,
                meta={'item': item}
            )

        next_page = response.css('a.page-link[aria-label="Go forward one page"]::attr(href)').get()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def wait_for_price_update(self, initial_price):
        max_attempts = 10
        wait_time = 1

        for _ in range(max_attempts):
            try:
                price_element = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "span.txt-pur.large-price.font-weight-bold")
                    )
                )
                current_price = price_element.text.strip()

                # If the price has changed from the initial price, return the new price
                if current_price != initial_price:
                    return current_price

                time.sleep(wait_time)
            except Exception:
                time.sleep(wait_time)
                continue

        return None
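
    # An equivalent, shorter wait sketched as a separate helper (hypothetical
    # name, not part of the original spider): WebDriverWait polls the lambda
    # until the displayed price differs from initial_price or 10 seconds pass.
    def wait_for_price_change(self, initial_price):
        price_css = "span.txt-pur.large-price.font-weight-bold"
        try:
            WebDriverWait(self.driver, 10).until(
                lambda d: d.find_element(By.CSS_SELECTOR, price_css).text.strip() != initial_price
            )
            return self.driver.find_element(By.CSS_SELECTOR, price_css).text.strip()
        except Exception:
            return None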

    def parse_product_details(self, response):
        item = response.meta['item']

        self.driver.get(response.url)
        time.sleep(3)  # Wait for initial page load

        try:
            # Get initial price
            initial_price_element = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "span.txt-pur.large-price.font-weight-bold")
                )
            )
            initial_price = initial_price_element.text.strip()

            # First check if a variation dropdown exists
            dropdown = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "select._itmspec_opt"))
            )

            # Scroll to the dropdown
            self.driver.execute_script("arguments[0].scrollIntoView(true);", dropdown)
            time.sleep(1)

            # Collect all option values and texts first
            options_data = []
            options = dropdown.find_elements(By.TAG_NAME, "option")
            for option in options:
                try:
                    value = option.get_attribute("value")
                    text = option.text.strip()
                    # Skip placeholder entries like "Select an option"
                    if value and text and "select" not in text.lower() and "option" not in text.lower():
                        options_data.append({"value": value, "text": text})
                except Exception:
                    continue

            # Now process each option
            for option_data in options_data:
                try:
                    # Refresh the dropdown element for each iteration
                    dropdown = WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "select._itmspec_opt"))
                    )

                    # Use pure JavaScript to select the option and fire a change event
                    select_script = f"""
                        var select = document.querySelector('select._itmspec_opt');
                        select.value = '{option_data["value"]}';
                        var event = new Event('change', {{ bubbles: true }});
                        select.dispatchEvent(event);
                    """
                    self.driver.execute_script(select_script)

                    # Give the page a moment, then wait for the price to update
                    time.sleep(2)
                    new_price = self.wait_for_price_update(initial_price)

                    if new_price:
                        self.logger.info(f"Successfully scraped - Size: {option_data['text']}, Price: {new_price}")

                        # Create a new item for this variation
                        variation_item = item.copy()
                        variation_item['Product_Title'] = f"{item['Product_Title']} - {option_data['text']}"
                        variation_item['Size'] = option_data['text']
                        variation_item['Regular_Price'] = new_price

                        # Add the remaining details
                        self.add_product_details(response, variation_item)
                        yield variation_item
                    else:
                        self.logger.warning(f"Price did not update for size: {option_data['text']}")

                except Exception as e:
                    self.logger.error(f"Error processing option {option_data['text']}: {str(e)}")
                    continue

        except Exception as e:
            self.logger.info(f"No variations found or error: {str(e)}")
            # Process as a single product without variations
            try:
                price_element = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "span.txt-pur.large-price.font-weight-bold")
                    )
                )
                item['Regular_Price'] = price_element.text.strip()
            except Exception:
                item['Regular_Price'] = "Not Available"

            item['Size'] = "One Size"
            self.add_product_details(response, item)
            yield item

    def add_product_details(self, response, item):
        # Extract Description
        description_div = response.css('div.productdetails.n-responsive-content')
        if description_div:
            description_texts = description_div.xpath('.//text()').getall()
            description = ' '.join([text.strip() for text in description_texts if text.strip()])
            description = re.sub(r'\s+', ' ', description)
            item['Description'] = description.strip()
        else:
            item['Description'] = "Not Available"

        # Extract Brand Name
        brand_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Brand")]]/td[2]/a/text()'
        )
        if brand_row:
            item['Brand_name'] = brand_row.get().strip()
        else:
            item['Brand_name'] = "Not Available"

        # Extract Life Stage
        lifestage_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Life Stage")]]/td[2]/text()'
        )
        if lifestage_row:
            item['Lifestage'] = lifestage_row.get().strip()
        else:
            item['Lifestage'] = "Not Available"

    def closed(self, reason):
        # Shut down the Selenium driver when the spider finishes
        self.driver.quit()
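If the JavaScript injection used above ever proves brittle, Selenium's built-in Select helper is a common alternative for standard HTML dropdowns. A minimal sketch, where driver and option_value stand in for the spider's self.driver and option_data["value"]:

from selenium.webdriver.support.ui import Select

dropdown = driver.find_element(By.CSS_SELECTOR, "select._itmspec_opt")
# Clicking the option this way triggers the browser's native change event
Select(dropdown).select_by_value(option_value)

Which approach is more reliable depends on whether the page's price-update handlers respond to native clicks or only to dispatched events; that would need testing against the live site.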
