Issue
I am trying to scrape data from the real estate site https://www.spitogatos.gr/. I saw from its robots.txt that it uses "The Ultimate robots.txt Bot and User-Agent Blocker". I only want to scrape the site once a day — is there a way around this blocking when scraping with Scrapy? Thank you in advance.
import scrapy
class MainprojectSpider(scrapy.Spider):
    """Spider from the question: fetch the homepage once with a custom User-Agent.

    Two fixes applied to the original snippet:
    1. The User-Agent string literal was broken across two source lines
       (a syntax error); it is now a single implicitly-concatenated literal.
    2. The header key was ``'User Agent'`` (with a space), which is not a
       valid HTTP header name, so the custom UA was never actually sent.
       It must be the hyphenated ``'User-Agent'``.
    """

    name = 'mainProject'
    allowed_domains = ['www.spitogatos.gr']

    # Desktop Chrome user-agent string used for every request.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/91.0.4472.124 Safari/537.36')

    # start_urls = ['https://www.spitogatos.gr/']

    def start_requests(self):
        """Issue a single GET for the homepage with the custom User-Agent."""
        yield scrapy.Request(
            url='https://www.spitogatos.gr',
            callback=self.parse,
            # Fixed header key: 'User-Agent', not 'User Agent'.
            headers={'User-Agent': self.user_agent},
        )

    def parse(self, response):
        """Dummy callback: print the page's matching h2 heading texts."""
        print(response.xpath('//h2[@class="text thin h1"]/text()').extract())  # just dummy

    def set_user_agent(self, request):
        """Stamp this spider's User-Agent onto *request* and return it.

        NOTE(review): nothing in this snippet calls this helper — presumably
        intended as a downloader-middleware hook; confirm before relying on it.
        """
        request.headers['User-Agent'] = self.user_agent
        return request
Solution
Passing only a User-Agent in the headers is not sufficient to get a successful response from this website. You have to pass the complete set of request headers, along with the cookies, as captured from a real browser session.
Code
import scrapy
class MainprojectSpider(scrapy.Spider):
name = 'mainProject'
allowed_domains = ['spitogatos.gr']
headers = {
"authority": "www.spitogatos.gr",
"pragma": "no-cache",
"cache-control": "no-cache",
"sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"",
"sec-ch-ua-mobile": "?0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"sec-fetch-site": "none",
"sec-fetch-mode": "navigate",
"sec-fetch-user": "?1",
"sec-fetch-dest": "document",
"accept-language": "en-US,en;q=0.9"
}
cookies = {
"PHPSESSID": "po638herud8fh5bcd5faj6e2fn",
"spitogatosHomepageMap": "0",
"currentCurrency": "EUR",
"_ga": "GA1.2.995321575.1625941672",
"_gid": "GA1.2.2083505143.1625941672",
"_hjTLDTest": "1",
"_hjid": "81d5a4f5-1d68-4f26-86c9-be4f6f3c4072",
"_fbp": "fb.1.1625941672273.1445941947",
"__qca": "P0-96032413-1625941956393",
"_hjAbsoluteSessionInProgress": "1",
"openedTabs": "1",
"_gat_UA-3455846-10": "1",
"_gat_UA-3455846-2": "1",
"_hjIncludedInSessionSample": "1",
"reese84": "3:FGNSnDE4wRMItXmgo8P+Aw==:sGtH84yEJkKj63PngFcdU3iQbhkp11cYkDw3X06dMlyaUb7wTkc2Wah9Qovgk4eW/Gg34paBwJIFH5ywVR4iJmb+542uPLVNXHnd4LXLKtVOTdeLrew41lAeyvKyjAcHlsIW+El8j8715RwI9TirIOa50wILShhQbubz89vw4m4rSnNrNbI73GMNWQZBIaSG3Lct5PuBfdJjdl3rT4Fp1kR7dV/yggGv1e6T33RgojXdT23MRb9uG7TojFqiIlI75yqZ8XdxqsSDClwWq8b/vFbUagC19NlptsyY1OsG2v7jguFXIdcHeLnKTAx8UJ+cHz6heewN6SAGrdqw8b2GMVuvpbBhatIdmiP7d0+J5RUU+g5DO9eFGGiWV3ToR47VwCI48X4jxnhkslnXSnCesudsSh3mVIFWGRBIYL723SI=:K71/ib2SBQ6VfoGMhHDTkyPcg91uLZr+BqNFsVE0WFs="
}
def start_requests(self):
url = 'https://www.spitogatos.gr/'
yield scrapy.Request(
url=url,
method='GET',
cookies=self.cookies,
headers=self.headers,
callback= self.parse
)
def parse(self, response):
print(response.xpath('//h2[@class="text thin h1"]/text()').extract()) # your dummy xpath
Answered By - Shivam
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.