Dropdown List Scraping

The document contains two Scrapy spider implementations for scraping product details from the Lucky Pet website. Both extract product titles, links, images, stock status, prices, descriptions, brand names, and life stages, and both follow pagination to scrape multiple pages of products. The second version adds Selenium for dynamic content loading so that price variations driven by a size dropdown can be captured.
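Either spider can be launched outside a full Scrapy project via CrawlerProcess. The following is a minimal sketch, assuming Scrapy is installed and the spider class below is defined in (or imported into) the same file; "products.json" is an illustrative output name, not from the original document:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Export scraped items to JSON ("products.json" is an illustrative name)
    "FEEDS": {"products.json": {"format": "json"}},
})
process.crawl(LuckypetSpiderSpider)
process.start()  # blocks until the crawl finishes

The first, Selenium-free version of the spider follows.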


import scrapy
import re


class LuckypetSpiderSpider(scrapy.Spider):
    name = "luckypet_spider"
    allowed_domains = ["www.luckypet.com.au"]
    start_urls = ["https://www.luckypet.com.au/dog-shop/food-treats.html"]

    def parse(self, response):
        products = response.css('div.card.thumbnail.card-body')

        for product in products:
            Product_Title = product.css('p.card-title::text').get()

            # Skip this product if the title is missing
            if Product_Title is None:
                continue

            title_Link = product.css('a.thumbnail-image::attr(href)').get()

            # Get image URL and convert to absolute URL
            Image_url = product.css('img.product-image::attr(data-src)').get()
            if Image_url:
                Image_url = response.urljoin(Image_url)

            Stock_Status = product.css('span.badge::text').get()

            # Create item dictionary to store data
            item = {
                'Product_Title': Product_Title,
                'Title_Link': title_Link,
                'Image_url': Image_url,
                'Stock_Status': Stock_Status
            }

            # Follow the product link to get additional details
            yield scrapy.Request(
                url=response.urljoin(title_Link),  # urljoin in case the href is relative
                callback=self.parse_product_details,
                meta={'item': item}
            )

        # Check for pagination and follow the next page
        next_page = response.css('a.page-link[aria-label="Go forward one page"]::attr(href)').get()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the item from the request meta
        item = response.meta['item']

        # Extract Regular Price
        price = response.css('span.txt-pur.large-price.font-weight-bold::text').get()
        if price:
            item['Regular_Price'] = price.strip()
        else:
            item['Regular_Price'] = "Not Available"

        # Extract Description and clean it
        description_div = response.css('div.productdetails.n-responsive-content')
        if description_div:
            # Extract all text nodes from the description section
            description_texts = description_div.xpath('.//text()').getall()
            # Join all text and remove extra whitespace
            description = ' '.join([text.strip() for text in description_texts if text.strip()])
            # Collapse multiple spaces
            description = re.sub(r'\s+', ' ', description)
            item['Description'] = description.strip()
        else:
            item['Description'] = "Not Available"

        # Extract Brand Name
        brand_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Brand")]]/td[2]/a/text()'
        )
        if brand_row:
            item['Brand_name'] = brand_row.get().strip()
        else:
            item['Brand_name'] = "Not Available"

        # Extract Life Stage
        lifestage_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Life Stage")]]/td[2]/text()'
        )
        if lifestage_row:
            item['Lifestage'] = lifestage_row.get().strip()
        else:
            item['Lifestage'] = "Not Available"

        # Return the final item
        yield item
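As a quick illustration of the Brand and Life Stage XPaths above, here is how they behave against a hypothetical spec-table fragment (the HTML below is invented to mirror the structure the spider expects, not copied from the live site); parsel is the selector library Scrapy itself uses:

from parsel import Selector

html = '''
<table class="table">
  <tr><td><strong>Brand</strong></td><td><a href="/brands/acme">Acme Pet Co</a></td></tr>
  <tr><td><strong>Life Stage</strong></td><td>Adult</td></tr>
</table>
'''
sel = Selector(text=html)
print(sel.xpath('//table[@class="table"]//tr[td[1]/strong[contains(text(), "Brand")]]/td[2]/a/text()').get())      # Acme Pet Co
print(sel.xpath('//table[@class="table"]//tr[td[1]/strong[contains(text(), "Life Stage")]]/td[2]/text()').get())   # Adult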
A second version of the same spider drives a real browser through Selenium so that the price shown for each size in the product dropdown can be captured:

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re


class LuckypetSpiderSpider(scrapy.Spider):
    name = "luckypet_spider"
    allowed_domains = ["www.luckypet.com.au"]
    start_urls = ["https://www.luckypet.com.au/dog-shop/food-treats.html"]

    # Throttle the crawl: one request at a time with a randomized 3-second delay
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'COOKIES_ENABLED': True,
        'CONCURRENT_REQUESTS': 1
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Local chromedriver binary (here the copy downloaded by webdriver-manager); adjust the path for your machine
        service = Service(r"C:\Users\Hp\.wdm\drivers\chromedriver\win64\chromedriver-win64\chromedriver.exe")
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        self.driver.set_window_size(1920, 1080)

    def parse(self, response):
        products = response.css('div.card.thumbnail.card-body')

        for product in products:
            Product_Title = product.css('p.card-title::text').get()

            if Product_Title is None:
                continue

            title_Link = product.css('a.thumbnail-image::attr(href)').get()

            Image_url = product.css('img.product-image::attr(data-src)').get()
            if Image_url:
                Image_url = response.urljoin(Image_url)

            Stock_Status = product.css('span.badge::text').get()
            item = {
                'Product_Title': Product_Title,
                'Title_Link': title_Link,
                'Image_url': Image_url,
                'Stock_Status': Stock_Status
            }

            yield scrapy.Request(
                url=response.urljoin(title_Link),  # urljoin in case the href is relative
                callback=self.parse_product_details,
                meta={'item': item}
            )

        next_page = response.css('a.page-link[aria-label="Go forward one page"]::attr(href)').get()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def wait_for_price_update(self, initial_price):
        max_attempts = 10
        wait_time = 1

        for _ in range(max_attempts):
            try:
                price_element = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "span.txt-pur.large-price.font-weight-bold")
                    )
                )
                current_price = price_element.text.strip()

                # If the price has changed from the initial price, return the new price
                if current_price != initial_price:
                    return current_price

                time.sleep(wait_time)
            except Exception:
                time.sleep(wait_time)
                continue

        return None
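
    # An equivalent, shorter wait sketched as a separate helper (hypothetical
    # name, not part of the original spider): WebDriverWait polls the lambda
    # until the displayed price differs from initial_price or 10 seconds pass.
    def wait_for_price_change(self, initial_price):
        price_css = "span.txt-pur.large-price.font-weight-bold"
        try:
            WebDriverWait(self.driver, 10).until(
                lambda d: d.find_element(By.CSS_SELECTOR, price_css).text.strip() != initial_price
            )
            return self.driver.find_element(By.CSS_SELECTOR, price_css).text.strip()
        except Exception:
            return None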

    def parse_product_details(self, response):
        item = response.meta['item']

        self.driver.get(response.url)
        time.sleep(3)  # Wait for initial page load

        try:
            # Get initial price
            initial_price_element = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "span.txt-pur.large-price.font-weight-bold")
                )
            )
            initial_price = initial_price_element.text.strip()

            # First check if a variation dropdown exists
            dropdown = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "select._itmspec_opt"))
            )

            # Scroll to the dropdown
            self.driver.execute_script("arguments[0].scrollIntoView(true);", dropdown)
            time.sleep(1)

            # Collect all option values and texts first
            options_data = []
            options = dropdown.find_elements(By.TAG_NAME, "option")
            for option in options:
                try:
                    value = option.get_attribute("value")
                    text = option.text.strip()
                    # Skip placeholder entries like "Select an option"
                    if value and text and "select" not in text.lower() and "option" not in text.lower():
                        options_data.append({"value": value, "text": text})
                except Exception:
                    continue

            # Now process each option
            for option_data in options_data:
                try:
                    # Refresh the dropdown element for each iteration
                    dropdown = WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "select._itmspec_opt"))
                    )

                    # Use pure JavaScript to select the option and fire a change event
                    select_script = f"""
                        var select = document.querySelector('select._itmspec_opt');
                        select.value = '{option_data["value"]}';
                        var event = new Event('change', {{ bubbles: true }});
                        select.dispatchEvent(event);
                    """
                    self.driver.execute_script(select_script)

                    # Give the page a moment, then wait for the price to update
                    time.sleep(2)
                    new_price = self.wait_for_price_update(initial_price)

                    if new_price:
                        self.logger.info(f"Successfully scraped - Size: {option_data['text']}, Price: {new_price}")

                        # Create a new item for this variation
                        variation_item = item.copy()
                        variation_item['Product_Title'] = f"{item['Product_Title']} - {option_data['text']}"
                        variation_item['Size'] = option_data['text']
                        variation_item['Regular_Price'] = new_price

                        # Add the remaining details
                        self.add_product_details(response, variation_item)
                        yield variation_item
                    else:
                        self.logger.warning(f"Price did not update for size: {option_data['text']}")

                except Exception as e:
                    self.logger.error(f"Error processing option {option_data['text']}: {str(e)}")
                    continue

        except Exception as e:
            self.logger.info(f"No variations found or error: {str(e)}")
            # Process as a single product without variations
            try:
                price_element = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "span.txt-pur.large-price.font-weight-bold")
                    )
                )
                item['Regular_Price'] = price_element.text.strip()
            except Exception:
                item['Regular_Price'] = "Not Available"

            item['Size'] = "One Size"
            self.add_product_details(response, item)
            yield item

    def add_product_details(self, response, item):
        # Extract Description
        description_div = response.css('div.productdetails.n-responsive-content')
        if description_div:
            description_texts = description_div.xpath('.//text()').getall()
            description = ' '.join([text.strip() for text in description_texts if text.strip()])
            description = re.sub(r'\s+', ' ', description)
            item['Description'] = description.strip()
        else:
            item['Description'] = "Not Available"

        # Extract Brand Name
        brand_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Brand")]]/td[2]/a/text()'
        )
        if brand_row:
            item['Brand_name'] = brand_row.get().strip()
        else:
            item['Brand_name'] = "Not Available"

        # Extract Life Stage
        lifestage_row = response.xpath(
            '//table[@class="table"]//tr[td[1]/strong[contains(text(), "Life Stage")]]/td[2]/text()'
        )
        if lifestage_row:
            item['Lifestage'] = lifestage_row.get().strip()
        else:
            item['Lifestage'] = "Not Available"

    def closed(self, reason):
        # Shut down the Selenium driver when the spider finishes
        self.driver.quit()
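If the JavaScript injection used above ever proves brittle, Selenium's built-in Select helper is a common alternative for standard HTML dropdowns. A minimal sketch, where driver and option_value stand in for the spider's self.driver and option_data["value"]:

from selenium.webdriver.support.ui import Select

dropdown = driver.find_element(By.CSS_SELECTOR, "select._itmspec_opt")
# Clicking the option this way triggers the browser's native change event
Select(dropdown).select_by_value(option_value)

Which approach is more reliable depends on whether the page's price-update handlers respond to native clicks or only to dispatched events; that would need testing against the live site.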
