Issue
I'm trying to grab some data from the left-side column of a webpage. The aim is to click on all of the "show more" buttons using scrapy_playwright, and grab the title of each element belonging to the expanded "show more" lists. However, when I run my scraper it repeats the same header, "Make", for all of the lists. I need the header to be unique for each set of lists.
Here's my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine

class ConfusedItem(scrapy.Item):
    clicks = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())

class ConfusedSpider(scrapy.Spider):
    name = 'confused'
    allowed_domains = ['x']
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
    custom_settings = {
        'User_Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            for i in range(0, 11):
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    meta=dict(
                        playwright=True,
                        playwright_include_page=True,
                        playwright_page_coroutines=[
                            PageCoroutine("click", selector=f"(//div[@class='toggle-bottom-filter'])[{i}]"),
                            PageCoroutine("wait_for_timeout", 5000),
                        ]
                    ),
                )

    def parse(self, response):
        container = response.xpath("(//div[@id]//ul[@class='list-filter disp-bloc list-model1'])//li")
        test = response.xpath("(//div[@class='elem-filter id_marque clearfix'])")
        for items in container:
            for values in test:
                loader = ItemLoader(ConfusedItem(), selector=items)
                loader.add_xpath('clicks', './/@onclick')
                loader.add_value('category', values.xpath("(//h2[@class=' select-load select-off'])//text()").getall())
                yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'json_data.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ConfusedSpider)
process.start()
Output:
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Expected output:
{'category': 'SELLER TYPE',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Solution
Your code has two issues. First, your XPath selectors are not correct, and second, you are not actually using scrapy-playwright, so the clicks are never performed. Looping over an incrementing item index is also wrong: once you click a "show more" button it is removed from the DOM, so the next button moves up to the first index. Also, to enable scrapy-playwright you need at least these additional settings:
'DOWNLOAD_HANDLERS': {
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
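As a minimal sketch (not part of the original answer), the same settings can also live on the spider itself via custom_settings instead of being passed to CrawlerProcess; the "http" handler entry shown here is optional and only matters if the site is also fetched over plain HTTP:

import scrapy

class ExampleSpider(scrapy.Spider):  # hypothetical spider name, for illustration only
    name = "example"
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }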
I have corrected those two issues in the code below. You will still need to add some error handling, and you should find a better way of determining how many clicks to issue; see the sketch after the code for one possible approach.
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine

class ConfusedItem(scrapy.Item):
    clicks = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())

class ConfusedSpider(scrapy.Spider):
    name = 'confused'
    allowed_domains = ['x']
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
    custom_settings = {
        'User_Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta=dict(
                    playwright=True,
                    # Each click removes the clicked "show more" button from the
                    # DOM, so clicking the same selector repeatedly expands one
                    # filter list per click.
                    playwright_page_coroutines=[
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']")
                        for _ in range(9)
                    ],
                ),
            )

    def parse(self, response):
        # Each child div of #face_links is one filter block: an h2 header
        # followed by a ul of clickable values, so the header stays paired
        # with its own list.
        for category in response.xpath("//div[@id='face_links']/div"):
            name = category.xpath("./h2/text()").get()
            for item in category.xpath("./ul/li"):
                loader = ItemLoader(ConfusedItem(), selector=item)
                loader.add_xpath('clicks', './@onclick')
                loader.add_value("category", name)
                yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'json_data.jl',
        'FEED_FORMAT': 'jsonlines',
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 60000,
        "PLAYWRIGHT_BROWSER_TYPE": "webkit"
    }
)
process.crawl(ConfusedSpider)
process.start()
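As a rough sketch of the "better way" mentioned above (this is an assumption, not part of the original answer), you can avoid hard-coding the number of clicks: with playwright_include_page the callback receives the live Playwright page, so it can keep clicking the first remaining "show more" button until none are left and then parse the final HTML. The ClickAllSpider name is hypothetical, and the selectors are the same ones used above.

import scrapy

class ClickAllSpider(scrapy.Spider):  # hypothetical name, for illustration
    name = 'click_all'
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']

    def start_requests(self):
        yield scrapy.Request(
            url=self.start_urls[0],
            meta=dict(
                playwright=True,
                playwright_include_page=True,  # hand the live page to the callback
            ),
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # Each click removes the clicked button from the DOM, so repeatedly
        # clicking the first match eventually exhausts the "show more" buttons.
        # The range() cap is a safety limit against an unexpected infinite loop.
        for _ in range(50):
            button = await page.query_selector("//div[@class='toggle-bottom-filter']")
            if button is None:
                break
            await button.click()
        html = await page.content()
        await page.close()
        # Re-parse the fully expanded page with Scrapy's own selector.
        selector = scrapy.Selector(text=html)
        for category in selector.xpath("//div[@id='face_links']/div"):
            name = category.xpath("./h2/text()").get()
            for item in category.xpath("./ul/li"):
                yield {"category": name, "clicks": item.xpath("./@onclick").get()}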
Answered By - msenior_