Issue
I am trying to scrape Pararius.nl for practice with Scrapy, but when I start crawling I get blocked by an anti-bot protection page. How can I get past it? Do I need any extra tools? Please help with an example.
def parse(self, response):
    url = 'https://www.pararius.nl/{deal_type}/nederland/p-{page}/'
    for deal_type in ['huurwoningen', 'koopwoningen']:
        for i in range(1, 2):
            yield scrapy.Request(url.format(deal_type=deal_type, page=i), callback=self.parse_pages, cookies=self.cookies,
                                 headers=self.h, method='GET', cb_kwargs={'deal_type': deal_type})

def parse_pages(self, response, deal_type):
    print(response.url)
    return
Solution
I was able to paginate and collect data from the page without any issues.
Your URL uses p-, but as far as I can see on the website it should be page-.
Here is the code I used:
import re

import scrapy

# Spider methods; note the page-{page} URL pattern instead of p-{page}:

def parse(self, response):
    url = 'https://www.pararius.nl/{deal_type}/nederland/page-{page}/'
    # 'koopwoningen' can be added back to scrape sale listings as well
    for deal_type in ['huurwoningen']:
        for i in range(1, 2):
            yield scrapy.Request(
                url.format(deal_type=deal_type, page=i),
                callback=self.parse_pages,
                method='GET',
            )

def parse_pages(self, response):
    for row in response.css('li.search-list__item'):
        href = row.css('a.listing-search-item__link--depiction::attr(href)').get()
        if href:
            yield {
                'search_url': response.url,
                'url': 'https://pararius.nl' + href,
                'title': (row.xpath(
                    './/div[contains(@class, "listing-search-item__sub-title")]/text()').get() or '').strip()
            }
    # follow pagination: the next page number travels in request meta, starting at 2
    page = response.meta.get('page', 2)
    if 'page-' in response.url:
        next_url = re.sub(r'page-\d+', f'page-{page}', response.url)
    else:
        next_url = response.url.strip('/') + f'/page-{page}'
    page += 1
    if page < 10:
        yield scrapy.Request(
            next_url,
            callback=self.parse_pages,
            method='GET',
            meta={'page': page}
        )
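If you want both listing types from the original question, the deal type can be carried along via cb_kwargs, as in your own snippet. A short sketch combining the two versions (pagination would need the same cb_kwargs passed on, omitted here for brevity):

def parse(self, response):
    url = 'https://www.pararius.nl/{deal_type}/nederland/page-{page}/'
    for deal_type in ['huurwoningen', 'koopwoningen']:
        yield scrapy.Request(
            url.format(deal_type=deal_type, page=1),
            callback=self.parse_pages,
            # cb_kwargs delivers deal_type as an argument to the callback
            cb_kwargs={'deal_type': deal_type},
        )

def parse_pages(self, response, deal_type):
    for row in response.css('li.search-list__item'):
        href = row.css('a.listing-search-item__link--depiction::attr(href)').get()
        if href:
            yield {
                'deal_type': deal_type,
                'url': 'https://pararius.nl' + href,
            }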
Additional settings I used (in settings.py):
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
ROBOTSTXT_OBEY = False
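If you prefer to keep these overrides scoped to one spider rather than project-wide, Scrapy also supports a custom_settings class attribute. A minimal sketch (the class and spider name here are just placeholders):

import scrapy

class ParariusSpider(scrapy.Spider):
    name = 'pararius'  # hypothetical name, adjust to your project
    # per-spider overrides, applied on top of settings.py
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'ROBOTSTXT_OBEY': False,
    }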
And this is an example of the results I received:
{"search_url": "https://www.pararius.nl/huurwoningen/nederland/page-2", "url": "https://pararius.nl/studio-te-huur/breda/60de7cf7/vredenburchstraat", "title": "4811 RD Breda (Boeimeer)"},
{"search_url": "https://www.pararius.nl/huurwoningen/nederland/page-2", "url": "https://pararius.nl/appartement-te-huur/eindhoven/7035fc54/aalsterweg", "title": "5615 CH Eindhoven (Looiakkers)"},
{"search_url": "https://www.pararius.nl/huurwoningen/nederland/page-2", "url": "https://pararius.nl/appartement-te-huur/rotterdam/a5eba2a3/herman-gorterstraat", "title": "3061 SM Rotterdam (Kralingen West)"},
For zah.nl an additional library is required: pip install undetected-chromedriver.
You can use the following spider, which opens the page once in a real Chrome window to obtain valid cookies and then hands them to Scrapy:
import re
import time

import scrapy
import undetected_chromedriver as uc


class ZahSpider(scrapy.Spider):
    name = 'zah'
    allowed_domains = ['www.zah.nl']
    start_urls = ['https://www.zah.nl/te-koop/?page=1']
    headers = {
        'authority': 'www.zah.nl',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'uk-UA,uk;q=0.9,en-US;q=0.8,en;q=0.7',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        # open the page once in a real (undetected) Chrome to pass the bot check
        options = uc.ChromeOptions()
        driver = uc.Chrome(options=options)
        driver.maximize_window()
        driver.get('https://www.zah.nl/te-koop/?page=1')
        time.sleep(5)
        # copy the browser cookies into a dict that Scrapy requests can reuse
        cookies_list = driver.get_cookies()
        self.cookies_dict = {}
        for cookie in cookies_list:
            self.cookies_dict[cookie['name']] = cookie['value']
        driver.quit()
        yield scrapy.Request(
            url='https://www.zah.nl/te-koop/?page=1',
            cookies=self.cookies_dict,
            headers=self.headers,
            callback=self.parse
        )

    def parse(self, response):
        for row in response.css('div.result'):
            yield {
                'title': row.css('a > h2::text').get(),
                'url': row.css('a::attr(href)').get()
            }
        # paginate up to page 4 by rewriting the ?page= query parameter
        page = response.meta.get('page', 2)
        if page < 5:
            next_url = re.sub(r'page=\d+', f'page={page}', response.url)
            page += 1
            yield scrapy.Request(
                url=next_url,
                cookies=self.cookies_dict,
                headers=self.headers,
                meta={'page': page},
                callback=self.parse
            )
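To try the spider without setting up a full Scrapy project, one option (not part of the original answer, just a quick way to run it) is CrawlerProcess:

from scrapy.crawler import CrawlerProcess

# run ZahSpider standalone; settings mirror the ones used above
process = CrawlerProcess(settings={
    'ROBOTSTXT_OBEY': False,
})
process.crawl(ZahSpider)
process.start()  # blocks until the crawl finishes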
Answered By - Roman