Issue
The `next_page` variable holds the correct link when tested in the Scrapy shell, and even when printed to the console, but Scrapy still keeps scraping the same (first) page.
code below:
class QuotesSpider(scrapy.Spider):
    """Spider that walks a Flipkart listing by following its pagination links."""

    name = "Bider"

    def start_requests(self):
        """Yield the initial request for each start URL, handled by ``parse``."""
        urls = [
            "https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%255B%255D%3DMen&p[]=facets.ideal_for%255B%255D%3Dmen&otracker=categorytree&fm=neo%2Fmerchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%2527s%2BBottom%2BWear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the last pagination link on the page and request it.

        Yields a single follow-up request whose callback is this method
        again. Yields nothing when the response contains no
        ``a._1LKTO3`` links.
        """
        # NOTE(review): the page loaded in Chrome is never read; every
        # selector below runs on Scrapy's own response. The browser is
        # closed right away so one Chrome process is not leaked per page.
        browser = webdriver.Chrome()
        try:
            browser.get(response.request.url)
        finally:
            browser.quit()

        page_links = response.css("a._1LKTO3::attr(href)").getall()
        if not page_links:
            # The response body does not change, so sleeping and running
            # the same selector again cannot find new links; stop here
            # instead of failing with IndexError.
            return

        # Presumably the last pagination anchor is the "Next" link --
        # TODO confirm what it is on the final page of the listing.
        next_page = page_links[-1]
        print("\n\n\n NEXT PAGE\n\n\n")
        print("\n" + next_page + "\n")
        next_page = response.urljoin(next_page)
        print(next_page)
        yield scrapy.Request(next_page, callback=self.parse)
Solution
Your code works for me, so I'm not sure why it doesn't work for you. In any case, the pagination below also works and is cleaner.
import scrapy
from selenium import webdriver
class QuotesSpider(scrapy.Spider):
    """Spider that follows the "Next" pagination link of a Flipkart listing."""

    name = "Bider"

    def start_requests(self):
        """Yield the initial request for each start URL, handled by ``parse``."""
        urls = [
            "https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%255B%255D%3DMen&p[]=facets.ideal_for%255B%255D%3Dmen&otracker=categorytree&fm=neo%2Fmerchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%2527s%2BBottom%2BWear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the "Next" link, if the page has one, and request it.

        Yields a single follow-up request whose callback is this method
        again. Yields nothing when no anchor wrapping a ``span`` with the
        text "Next" is found.
        """
        # NOTE(review): the page loaded in Chrome is never read; the
        # XPath below runs on Scrapy's own response. The browser is
        # closed right away so one Chrome process is not leaked per page.
        browser = webdriver.Chrome()
        try:
            browser.get(response.request.url)
        finally:
            browser.quit()

        next_page = response.xpath('//a[span[text()="Next"]]/@href').get()
        if not next_page:
            return

        print("\n\n\n NEXT PAGE\n\n\n")
        print("\n" + next_page + "\n")
        next_page = response.urljoin(next_page)
        print(next_page)
        yield scrapy.Request(next_page, callback=self.parse)
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.