Best Python code snippet using playwright-python
main.py
Source: main.py
...
        await page.wait_for_timeout(1000)
        items: List[Dict[str, Any]] = []
        if input.is_exact is True:
            # loop through the items and try to match the name
            titles = await page.eval_on_selector_all(
                '//div[@class="buy-box"]/div[@class="left"]/p',
                '(boxes) => boxes.map(box => box.innerText)',
            )
            distances = [distance(input.query, x) for x in titles]
            item_matches = sorted(zip(titles, distances, range(1, len(titles) + 1)), key=lambda x: x[1])
            print(item_matches)
            if len(item_matches) > 0:
                # select the closest one
                button_selector = f'(//div[@class="buy-box"])[{item_matches[0][2]}]/*[contains(@class, "button")]'
                # classList serializes to a dict of index -> class name
                button_class_dict: Dict[str, str] = await page.eval_on_selector(
                    button_selector,
                    '(button) => button.classList',
                )
                if 'custom' not in button_class_dict.values():
                    # just click on the button
                    await page.click(button_selector, click_count=input.quantity)
                    item_title = await page.eval_on_selector(
                        '(//div[@class="buy-box"])[1]/div[@class="left"]/p',
                        '(title) => title.innerText',
                    )
                    items.append({'name': item_title, 'quantity': input.quantity})
                else:
                    # go into customization
                    await page.click(button_selector)
                    await page.wait_for_timeout(500)
                    # get the first sub-item
                    item_title = await page.eval_on_selector(
                        '(//h1[@class="title"])[1]',
                        '(title) => title.innerText',
                    )
                    await page.fill('//input[contains(@class, "count")]', str(input.quantity))
                    await page.wait_for_timeout(500)
                    await page.click('button.to-cart')
                    await page.wait_for_timeout(1000)
                    await page.goto('https://mcd.cn/product')
                    # await page.click('div:text("…")')  # tab label garbled beyond recovery in the source
                    items.append({'name': item_title, 'quantity': input.quantity})
        else:
            result_count = await page.eval_on_selector_all(
                '//div[@class="buy-box"]/div[@class="left"]/p',
                '(boxes) => boxes.length',
            )
            # pick one at random
            item_idx = random.sample(range(1, result_count + 1), input.quantity)
            for idx in item_idx:
                button_selector = f'(//div[@class="buy-box"])[{idx}]/*[contains(@class, "button")]'
                button_class_dict: Dict[str, str] = await page.eval_on_selector(
                    button_selector,
                    '(button) => button.classList',
                )
                if 'custom' not in button_class_dict.values():
                    # just click on the button
                    await page.click(button_selector, click_count=1)
                    item_title = await page.eval_on_selector(
                        '(//div[@class="buy-box"])[1]/div[@class="left"]/p',
                        '(title) => title.innerText',
                    )
                    items.append({'name': item_title, 'quantity': 1})
                else:
                    # go into customization
                    await page.click(button_selector)
                    await page.wait_for_timeout(500)
                    # get the first sub-item
                    item_title = await page.eval_on_selector(
                        '(//h1[@class="title"])[1]',
                        '(title) => title.innerText',
                    )
                    await page.fill('//input[contains(@class, "count")]', '1')
                    await page.wait_for_timeout(500)
                    await page.click('button.to-cart')
                    await page.wait_for_timeout(1000)
                    items.append({'name': item_title, 'quantity': 1})
                # await page.click('div:text("…")')  # tab label garbled beyond recovery in the source
                await page.goto('https://mcd.cn/product')
                await page.wait_for_timeout(2000)
                await page.fill('//input[contains(@placeholder, "最多")]', input.query)
                await page.click('button.ant-input-search-button')
                await page.wait_for_timeout(1000)
        state = 'ordered'
        metadata = {'items': items}
        sessions[session_id] = (session, browser, page, state)
        return SessionOutput(id=session_id, state=state, metadata=metadata)
    except KeyError:
        raise HTTPException(status_code=404, detail='session not found')


@app.post('/sessions/{session_id}/cart/clear', response_model=SessionOutput)
async def clear_session_cart(session_id: str):
    try:
        session, browser, page, state = sessions[session_id]
        await page.click('//div[@class="car"]')
        await page.wait_for_timeout(500)
        await page.click('span:text("清空购物车")')  # "clear shopping cart"
        await page.wait_for_timeout(500)
        await page.click('//div[@class="ant-popover-buttons"]/button[contains(@class, "ant-btn-primary")]')
        state = 'cart_cleared'
        sessions[session_id] = (session, browser, page, state)
        return SessionOutput(id=session_id, state=state)
    except KeyError:
        raise HTTPException(status_code=404, detail='session not found')


@app.get('/sessions/{session_id}/cart', response_model=SessionOutput)
async def get_session_cart(session_id: str):
    try:
        session, browser, page, state = sessions[session_id]
        cart_price_texts = await page.eval_on_selector_all(
            '//div[@class="price-info"]/span',
            '(spans) => spans.map((span) => span.innerText)',
        )
        address_texts = await page.eval_on_selector_all(
            '//div[@class="othpart address"]/div[@class="center"]/div',
            '(spans) => spans.map((span) => span.innerText)',
        )
        deliver_time_texts = await page.eval_on_selector_all(
            '//div[@class="othpart time"]/div[@class="center"]/div',
            '(spans) => spans.map((span) => span.innerText)',
        )
        checkout_button_class_dict: Dict[str, str] = await page.eval_on_selector(
            '//button[contains(@class, "to-check")]',
            '(button) => button.classList',
        )
        if 'grey' in checkout_button_class_dict.values():
            state = 'cart_empty'
            sessions[session_id] = (session, browser, page, state)
            return SessionOutput(
                id=session_id,
                state=state,
                metadata={
                    'items': [],
                    'cart_price_texts': cart_price_texts,
                    'address_texts': address_texts,
                    'deliver_time_texts': deliver_time_texts,
                },
            )
        else:
            await page.click('//div[@class="car"]')
            await page.wait_for_timeout(500)
            item_titles = await page.eval_on_selector_all(
                '//div[contains(@class, "cart-panel-details")]/div[@class="main"]/div/div/div[@class="name"]',
                '(titles) => titles.map(title => title.innerText)',
            )
            item_quantities = await page.eval_on_selector_all(
                '//div[contains(@class, "cart-panel-details")]/div[@class="main"]/div/div/div[@class="count-panel"]/div/input',
                '(quantities) => quantities.map(q => q.value)',
            )
            await page.wait_for_timeout(500)
            await page.click('//div[@class="close"]')
            state = 'cart_viewed'
            sessions[session_id] = (session, browser, page, state)
            return SessionOutput(
                id=session_id,
                state=state,
                metadata={
                    'items': list(map(lambda item: {'name': item[0], 'quantity': item[1]}, zip(item_titles, item_quantities))),
                    'cart_price_texts': cart_price_texts,
                    'address_texts': address_texts,
...
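The classList evaluations above rely on how Playwright serializes a DOMTokenList: it comes back as a dict keyed by index strings, which is why the snippet checks membership with .values(). A minimal, self-contained sketch of that pattern; the markup and class names here are invented for illustration, not taken from mcd.cn:

import asyncio
from playwright.async_api import async_playwright

async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # stand-in markup; the real code targets buy-box buttons on the live site
        await page.set_content('<button class="button custom">Customize</button>')
        classes = await page.eval_on_selector('button', '(el) => el.classList')
        print(classes)                        # e.g. {'0': 'button', '1': 'custom'}
        print('custom' in classes.values())   # True -> take the customization branch
        await browser.close()

asyncio.run(main())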
test_queryselector.py
Source: test_queryselector.py
...
    page = await context.new_page()
    await page.set_content("<div><span></span></div><div></div>")
    assert await page.eval_on_selector("tag=DIV", "e => e.nodeName") == "DIV"
    assert await page.eval_on_selector("tag=SPAN", "e => e.nodeName") == "SPAN"
    assert await page.eval_on_selector_all("tag=DIV", "es => es.length") == 2
    assert await page.eval_on_selector("tag2=DIV", "e => e.nodeName") == "DIV"
    assert await page.eval_on_selector("tag2=SPAN", "e => e.nodeName") == "SPAN"
    assert await page.eval_on_selector_all("tag2=DIV", "es => es.length") == 2
    # Selector names are case-sensitive.
    with pytest.raises(Error) as exc:
        await page.query_selector("tAG=DIV")
    assert 'Unknown engine "tAG" while parsing selector tAG=DIV' in exc.value.message
    await context.close()


async def test_selectors_register_should_work_with_path(
    selectors, page: Page, utils, assetdir
):
    await utils.register_selector_engine(
        selectors, "foo", path=assetdir / "sectionselectorengine.js"
    )
    await page.set_content("<section></section>")
    assert await page.eval_on_selector("foo=whatever", "e => e.nodeName") == "SECTION"


async def test_selectors_register_should_work_in_main_and_isolated_world(
    selectors, page: Page, utils
):
    dummy_selector_script = """{
    create(root, target) { },
    query(root, selector) {
        return window.__answer;
    },
    queryAll(root, selector) {
        return window['__answer'] ? [window['__answer'], document.body, document.documentElement] : [];
    }
}"""
    await utils.register_selector_engine(selectors, "main", dummy_selector_script)
    await utils.register_selector_engine(
        selectors, "isolated", dummy_selector_script, content_script=True
    )
    await page.set_content("<div><span><section></section></span></div>")
    await page.evaluate('() => window.__answer = document.querySelector("span")')
    # Works in main if asked.
    assert await page.eval_on_selector("main=ignored", "e => e.nodeName") == "SPAN"
    assert (
        await page.eval_on_selector("css=div >> main=ignored", "e => e.nodeName")
        == "SPAN"
    )
    assert await page.eval_on_selector_all(
        "main=ignored", "es => window.__answer !== undefined"
    )
    assert (
        await page.eval_on_selector_all(
            "main=ignored", "es => es.filter(e => e).length"
        )
        == 3
    )
    # Works in isolated by default.
    assert await page.query_selector("isolated=ignored") is None
    assert await page.query_selector("css=div >> isolated=ignored") is None
    # $$eval always works in main, to avoid adopting nodes one by one.
    assert await page.eval_on_selector_all(
        "isolated=ignored", "es => window.__answer !== undefined"
    )
    assert (
        await page.eval_on_selector_all(
            "isolated=ignored", "es => es.filter(e => e).length"
        )
        == 3
    )
    # At least one engine in main forces all to be in main.
    assert (
        await page.eval_on_selector(
            "main=ignored >> isolated=ignored", "e => e.nodeName"
        )
        == "SPAN"
    )
    assert (
        await page.eval_on_selector(
            "isolated=ignored >> main=ignored", "e => e.nodeName"
...
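These tests come from Playwright's own suite and exercise custom selector engines. As a minimal sketch of the same idea outside the test harness, here is how an engine can be registered and then used with the usual "engine=selector" syntax; the "tag" engine script below mirrors the behaviour the tests assume, and the rest is ordinary Playwright setup:

import asyncio
from playwright.async_api import async_playwright

TAG_ENGINE = """{
    // Return the first element matching the tag name.
    query(root, selector) {
        return root.querySelector(selector);
    },
    // Return all elements matching the tag name.
    queryAll(root, selector) {
        return Array.from(root.querySelectorAll(selector));
    }
}"""

async def main() -> None:
    async with async_playwright() as p:
        # Engines must be registered before the pages that use them are created.
        await p.selectors.register("tag", TAG_ENGINE)
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.set_content("<div><span></span></div><div></div>")
        print(await page.eval_on_selector("tag=DIV", "e => e.nodeName"))      # DIV
        print(await page.eval_on_selector_all("tag=DIV", "es => es.length"))  # 2
        await browser.close()

asyncio.run(main())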
TechJuice-xpath-Task.py
Source: TechJuice-xpath-Task.py
...
            Title = quote.query_selector('xpath=div/h2/a').inner_text()
            print('Title : ', Title)
            Description = quote.query_selector('xpath=div/p').inner_text()
            print('Description : ', Description)
            hrefs_of_page = quote.eval_on_selector_all(".text-dark", "elements => elements.map(element => element.href)")
            string_format_href = "".join(hrefs_of_page)
            article_url.append(string_format_href)
            print('hrefs_of_page : ', string_format_href)
            dic = {'Title': Title, 'Description': Description, 'Url': string_format_href}
            list_dic.append(dic)
            with open('Tech-Juice-xpath.csv', 'a', newline='') as outcsv:
                writer = csv.writer(outcsv)
                writer.writerow([Title, string_format_href, Description])
            print("\n")
        except Exception:
            Title = "Not Found"
            print('Title : ', Title)
            print('Description : ', Description)
            print('hrefs_of_page : ', hrefs_of_page)

    print('Article_url : ', article_url)
    page.wait_for_timeout(2000)
    for i in article_url:
        Article_page = browser.new_page()
        Article_page.goto(i)
        page.wait_for_timeout(3000)
        Article_Heading = Article_page.query_selector('xpath=//html/body/div/div/div/div/div/div/h1').inner_html()
        print('Article_Heading : ', Article_Heading)
        written_by = Article_page.query_selector('xpath=//html/body/div/div/div/div/div/div/div/small/a/span').inner_html()
        print("Written By --", written_by)
        # Article_Des = Article_page.query_selector('xpath=//html/body/div/div/div[2]/div/div/div/article/p/span').inner_html()
        # print('Article_Description : ', Article_Des)
        # with open('Tech-Juice-xpath.csv', 'a', newline='') as outcsv:
        #     writer = csv.writer(outcsv)
        #     writer.writerow(["", "", "", written_by, Article_Heading, Article_Des])

        Article_page.close()


if __name__ == '__main__':
    main('https://www.techjuice.pk/')

# blog = page.query_selector_all('site-content')
# blog = page.query_selector_all('.col-md-9')
# blog = page.query_selector_all('.site-content > .row > div:first-child .col-md-9')
# blog = page.query_selector_all('xpath=//html/body/div/div[1]/div/div')
# # t = quote.query_selector('xpath=div/h2/a[href].href')
...
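One weakness in this scraper is the broad except clause, which hides real failures and then prints variables that may never have been assigned. Since the sync query_selector() simply returns None when nothing matches, a small helper can avoid the exception entirely. A sketch only: safe_text and its default are hypothetical names, not part of the original script, and quote stands in for any ElementHandle from the loop above:

def safe_text(quote, selector: str, default: str = "Not Found") -> str:
    """Return inner_text() of the first match, or a default if nothing matches."""
    node = quote.query_selector(selector)
    return node.inner_text() if node is not None else default

# Hypothetical usage inside the loop above:
# Title = safe_text(quote, 'xpath=div/h2/a')
# Description = safe_text(quote, 'xpath=div/p')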
19-list.py
Source: 19-list.py
...
# list_dic = []
# article_url = []
# for quote in blog:
#     # # t = quote.query_selector('xpath=div/h2/a[href].href')
#     # t = quote.eval_on_selector_all(".text-dark", "elements => elements.map(element => element.href)")

#     try:
#         Title = quote.query_selector('xpath=div/h2/a').inner_text()
#         print('Title : ', Title)
#         Description = quote.query_selector('xpath=div/p').inner_text()
#         print('Description : ', Description)
#         hrefs_of_page = quote.eval_on_selector_all(".text-dark", "elements => elements.map(element => element.href)")
#         str = ""
#         string_format_href = str.join(hrefs_of_page)
#         article_url.append(string_format_href)
#         print('hrefs_of_page : ', string_format_href)
#         dic = {}
#         dic['Title', 'Description', 'Url'] = Title, Description, string_format_href
#         list_dic.append(dic)
#         print("\n")
#     except:
#         Title = "Not Found"
#         print('Title : ', Title)
#         print('Description : ', Description)
#         print('hrefs_of_page : ', hrefs_of_page)
...
How to scrape video media with Scrapy or other Python libraries?
How to check for element existence without getting an error in Playwright
Using Python with Playwright, how to get the value of an element?
Python Playwright make code reload page after timeout until it finds the object
Playwright won't navigate to URL (Python)
How to find element by attribute and text in a single locator?
Scrape info from popup window with Playwright in Python and store in pandas df
playwright._impl._api_types.Error: Evaluation failed: cyclic object value when retrieving dataLayer variable - Playwright-Python
Why does Playwright not change url once button is clicked on Uber Eats?
Launch persistent context from current directory in playwright
Here is a fully working solution using scrapy-playwright; I got the idea from the following issue 61 and from the user profile @lime-n. We capture the XHR requests the page sends and store them in a dict, using both the playwright tools and scrapy-playwright: I include playwright_page_event_handlers to hook playwright's response events into the spider. Depending on the video size, the download will take a few minutes.
main.py
import scrapy
from playwright.async_api import Response as PlaywrightResponse
import jsonlines
import pandas as pd
import json
from video.items import videoItem
headers = {
    'authority': 'api.equidia.fr',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'content-type': 'application/json',
    'origin': 'https://www.equidia.fr',
    'referer': 'https://www.equidia.fr/',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}
class videoDownloadSpider(scrapy.Spider):
    name = 'video'
    start_urls = ['https://www.equidia.fr/courses/2022-07-31/R1/C1']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                # callback=self.parse,
                meta={
                    'playwright': True,
                    'playwright_page_event_handlers': {
                        'response': 'handle_response',
                    },
                },
            )
    async def handle_response(self, response: PlaywrightResponse) -> None:
        """
        We can grab the post data with response.request.post_data (there are three
        variants, post_data, post_data_json and post_data_buffer, for different needs).
        The method below helps grab the 'xhr' and 'fetch' resource types, until I can
        work out how to send only these to the download request.
        """
        self.logger.info(f'test the log of data: {response.request.resource_type, response.request.url, response.request.method}')
        jl_file = 'videos.jl'
        data = {}
        if response.request.resource_type == 'xhr':
            if response.request.method == 'GET':
                if 'videos' in response.request.url:
                    # store plain strings (no trailing commas, which would create tuples)
                    data['resource_type'] = response.request.resource_type
                    data['request_url'] = response.request.url
                    data['method'] = response.request.method
                    with jsonlines.open(jl_file, mode='a') as writer:
                        writer.write(data)
    def parse(self, response):
        video = pd.read_json('videos.jl', lines=True)
        print('KEST: %s' % video['request_url'][0])
        yield scrapy.FormRequest(
            url=video['request_url'][0],
            headers=headers,
            callback=self.parse_video_json,
        )
    def parse_video_json(self, response):
        another_url = response.json().get('video_url')
        yield scrapy.FormRequest(
            another_url,
            headers=headers,
            callback=self.extract_videos,
        )

    def extract_videos(self, response):
        videos = response.json().get('mp4')
        for keys, vids in videos.items():
            loader = videoItem()
            loader['video'] = [vids]
            yield loader
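For comparison, the same capture idea can be sketched in plain Playwright without Scrapy: subscribe to the page's "response" event and append matching XHR GETs to videos.jl. The URL below is the spider's start URL; the five-second wait is an arbitrary assumption to give the player time to fire its requests:

import asyncio
import jsonlines
from playwright.async_api import async_playwright

async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        async def handle_response(response) -> None:
            req = response.request
            # same filter as the spider: XHR GETs whose URL mentions "videos"
            if req.resource_type == 'xhr' and req.method == 'GET' and 'videos' in req.url:
                with jsonlines.open('videos.jl', mode='a') as writer:
                    writer.write({'request_url': req.url, 'method': req.method})

        page.on('response', handle_response)
        await page.goto('https://www.equidia.fr/courses/2022-07-31/R1/C1')
        await page.wait_for_timeout(5000)  # assumed wait for the player's XHRs
        await browser.close()

asyncio.run(main())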
items.py
import scrapy


class videoItem(scrapy.Item):
    video = scrapy.Field()
pipelines.py
from scrapy.pipelines.files import FilesPipeline


class downloadFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        # name each download after the last URL segment, with an .mp4 suffix
        file = request.url.split('/')[-1]
        video_file = f"{file}.mp4"
        return video_file
settings.py
from pathlib import Path
import os
BASE_DIR = Path(__file__).resolve().parent.parent
FILES_STORE = os.path.join(BASE_DIR, 'videos')
FILES_URLS_FIELD = 'video'
FILES_RESULT_FIELD = 'results'
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
ITEM_PIPELINES = {
    'video.pipelines.downloadFilesPipeline': 150,
}
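To try the whole pipeline, start the spider with scrapy crawl video from the project root, or programmatically via a small runner like this sketch (it assumes the Scrapy project is named video, as the import paths above suggest):

from scrapy.cmdline import execute

# equivalent to running `scrapy crawl video` in the project directory
execute(['scrapy', 'crawl', 'video'])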