Issue
I have the following code. It opens the headless browser, and I can see the page being scrolled, but the response object in the parse method doesn't contain any HTML. When I don't use auto-scrolling, this spider works perfectly.
The code is only supposed to extract the product name and product price from this website.
import scrapy
import re
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup


def should_abort_request(req):
    if req.resource_type == "image":
        return True
    if req.method.lower() == 'post':
        return True
    return False


scrolling_script = """
const scrolls = 8
let scrollCount = 0

// scroll down and then wait for 5s
const scrollInterval = setInterval(() => {
  window.scrollTo(0, document.body.scrollHeight)
  scrollCount++

  if (scrollCount === numScrolls) {
    clearInterval(scrollInterval)
  }
}, 5000)
"""


class AuchanSpider(scrapy.Spider):
    name = 'auchan'
    custom_settings = {
        'PLAYWRIGHT_ABORT_REQUEST': should_abort_request
    }
    start_urls = ['https://zakupy.auchan.pl/shop/list/8029?shType=id']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_page_methods": [
                        PageMethod("evaluate", scrolling_script),
                        #PageMethod("wait_for_timeout", 30000),
                        PageMethod("wait_for_selector", "._1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_"),
                        PageMethod("wait_for_selector", "._1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_:nth-child(60)")
                    ],
                },
                errback=self.close_page,
                cb_kwargs=dict(main_url=url, page_number=0),
            )

    async def parse(self, response, main_url, page_number):
        soup = BeautifulSoup(response.text, 'html.parser')
        product_containers = soup.find_all('div', class_='_1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_')

        for product_container in product_containers:
            price = product_container.find(class_='_1-UB _1Evs').get_text()
            price = re.sub(r"[\n\t\s]*", "", price)

            yield {
                'productName': product_container.find(class_='_1DGZ').get_text(),
                'price': price
            }

    async def close_page(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
Solution
I'd approach this problem a bit more directly than you're doing. There's no need for BeautifulSoup because Playwright can already select elements on the live page. I'm not sure whether Scrapy is necessary, either, but you can adapt the following Playwright code to Scrapy if you want:
import re
from playwright.sync_api import sync_playwright  # 1.37.0
from time import sleep

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    url = "https://zakupy.auchan.pl/shop/list/8029?shType=id"
    page.goto(url)
    page.click("#onetrust-accept-btn-handler")
    page.click("._3YI0")
    text = page.locator("._3MDH").text_content().strip()
    expected = int(re.search(r"\d+$", text).group())
    records = {}

    while len(records) < expected:
        page.keyboard.press("PageDown")
        sleep(0.2)  # save a bit of CPU
        items = page.eval_on_selector_all(
            "._1DGZ",
            """els => els.map(e => ({
                href: e.href,
                text: e.textContent,
            }))""",
        )

        for x in items:
            # assume hrefs are unique
            records[x["href"]] = x

    print(records)
    browser.close()
This code dismisses the cookie and ad banners, then presses PageDown until there are no more records to fetch. I'm only pulling out the title and link from the DOM, but you can add more information if you want.
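For example, here's a sketch that also grabs each product's price. It assumes the card container class ._1E5b and the price classes ._1-UB._1Evs from your spider are still valid; I haven't verified them against the live page:

items = page.eval_on_selector_all(
    "._1DGZ",
    """els => els.map(e => {
        // walk up to the product card and read its price node, if present
        // (._1E5b and ._1-UB._1Evs are assumptions taken from the original spider)
        const card = e.closest("._1E5b");
        const priceEl = card && card.querySelector("._1-UB._1Evs");
        return {
            href: e.href,
            text: e.textContent,
            price: priceEl ? priceEl.textContent.trim() : null,
        };
    })""",
)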
Note that I'm using simpler selectors. The more assumptions baked into a selector, the more likely it is to fail when one of them doesn't hold. In your case, although the immediate issue was using spaces instead of dots to combine multiple classes on one element (a space denotes a descendant), not using so many classes in the first place would have avoided the confusion. Sanity check your selectors in the browser console first, keeping in mind that this doesn't guarantee they'll work in Playwright, given the different environment. The browser can generate sample selectors; although these are usually overly specific, they're at least valid and can be refined to be more reliable.
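To illustrate the difference in CSS terms, here's a fragment using two of the classes from your spider:

# a space means "descendant": this looks for a <_2I59> tag nested inside an
# element with class _1E5b, so it never matches a single product card
page.wait_for_selector("._1E5b _2I59")

# chaining the classes with dots targets one element carrying both classes
page.wait_for_selector("._1E5b._2I59")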
Also, I realize it's probably better to use the text at the bottom of the page, "Załadowano 361 produkt(y) na 361", to determine when all records have been scraped, but I'll leave that as an exercise.
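If you do go that route, one possible shape is below. It's a rough sketch; the exact footer text and the get_by_text lookup are assumptions about the page, not something I've verified:

def all_loaded(page):
    # "Załadowano X produkt(y) na Y" -- done once X equals Y
    # (hypothetical: assumes this footer text exists and keeps this exact format)
    footer = page.get_by_text(re.compile(r"Załadowano \d+ produkt\(y\) na \d+"))

    if footer.count() == 0:
        return False

    m = re.search(r"(\d+) produkt\(y\) na (\d+)", footer.first.text_content())
    return bool(m) and m.group(1) == m.group(2)

The while condition would then become while not all_loaded(page) instead of comparing len(records) to expected.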
Another approach is to intercept requests rather than scrape the document, which provides a lot more data (~2 MB for the page provided):
import json
from playwright.sync_api import sync_playwright
from time import sleep

def scrape(page):
    url = "https://zakupy.auchan.pl/shop/list/8029?shType=id"
    items = []
    done = False

    def handle_response(response):
        nonlocal done
        api_url = "https://zakupy.auchan.pl/api/v2/cache/products"

        if response.url.startswith(api_url):
            data = response.json()
            items.append(data)

            if data["pageCount"] == data["currentPage"]:
                with open("out.json", "w") as f:
                    json.dump(items, f)

                done = True

    page.on("response", handle_response)
    page.goto(url)
    page.click("#onetrust-accept-btn-handler")
    page.click("._3YI0")

    while not done:
        page.keyboard.press("PageDown")
        sleep(0.2)  # save a bit of CPU

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    scrape(browser.new_page())
    browser.close()
You can then loop over the JSON with jq to extract whatever information you want, say, the names:
jq '.[].results | .[] | .defaultVariant.name' < out.json
or in Python:
for y in items:
    for x in y["results"]:
        print(x["defaultVariant"]["name"])
with a list comp:
[x["defaultVariant"]["name"] for y in items for x in y["results"]]
Note that the above version misses the first page of records, which can be grabbed from the DOM or with a separate request that uses headers copied from another API request.
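For example, a sketch of the DOM option: before starting to scroll, collect whatever is already rendered with the same ._1DGZ selector used in the first script:

first_page_items = page.eval_on_selector_all(
    "._1DGZ",
    """els => els.map(e => ({
        href: e.href,
        text: e.textContent,
    }))""",
)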
However, once you go into request interception territory, you can hijack a request to their API and wire it to return 500 items, collecting all of the data more quickly and easily:
import json
from playwright.sync_api import sync_playwright
from time import sleep

def scrape(page):
    url = "https://zakupy.auchan.pl/shop/list/8029?shType=id"
    api_url = "https://zakupy.auchan.pl/api/v2/cache/products"
    new_url = "https://zakupy.auchan.pl/api/v2/cache/products?listId=8029&itemsPerPage=500&page=1&cacheSegmentationCode=019_DEF&hl=pl"
    done = False

    def handle(route, request):
        route.continue_(url=new_url)

    page.route("https://zakupy.auchan.pl/api/v2/cache/products*", handle)

    def handle_response(response):
        nonlocal done

        if response.url.startswith(api_url):
            with open("out1.json", "w") as f:
                json.dump(response.json(), f)

            done = True

    page.on("response", handle_response)
    page.goto(url)
    page.click("#onetrust-accept-btn-handler")
    page.click("._3YI0")

    while not done:
        page.keyboard.press("PageDown")
        sleep(0.2)  # save a bit of CPU

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    scrape(browser.new_page())
    browser.close()
This structure can be processed as follows, using the example of grabbing names:
jq '.results | .[] | .defaultVariant.name' < out1.json
or in Python:
data = json.load(open("out1.json"))
for x in data["results"]:
    print(x["defaultVariant"]["name"])
Answered By - ggorlen