Issue
I'm confused about why Scrapy won't extract the link for the next page in the following code. I believe it may have something to do with the fact that every link has a URL of index.php. Is it not working because I have to re-submit the original request body and headers with each subsequent request?
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# Request headers for the class-search POST.
# NOTE(review): the original dict listed 'Content-Type' twice with the same
# value (Python silently keeps the last entry), and the Referer pointed at
# pisa.usc.edu instead of pisa.ucsc.edu — both fixed here.
all_class_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    # Scrapy accepts a list of values for a single header.
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}
# Form body for the class-search POST: 'action=results' asks the site for
# the first page of search results. Empty bind values mean "no filter";
# '2228' presumably selects a specific academic term — TODO confirm against
# the live search form.
data = {
    'action': 'results',
    'binds[:term]': '2228',
    'binds[:reg_status]': 'all',
    'binds[:subject]': '',
    'binds[:catalog_nbr_op]': '=',
    'binds[:catalog_nbr]': '',
    'binds[:title]': '',
    'binds[:instr_name_op]': '=',
    'binds[:instructor]': '',
    'binds[:ge]': '',
    'binds[:crse_units_op]': '=',
    'binds[:crse_units_from]': '',
    'binds[:crse_units_to]': '',
    'binds[:crse_units_exact]': '',
    'binds[:days]': '',
    'binds[:times]': '',
    'binds[:acad_career]': '',
    # Instruction-mode flags: asynchronous / hybrid / synchronous / in-person.
    'binds[:asynch]': 'A',
    'binds[:hybrid]': 'H',
    'binds[:synch]': 'S',
    'binds[:person]': 'P',
}
page_2_form_data_additions = {'rec_start' : '0', 'rec_dur' : '25'}
def professor_filter(item):
    """Return True if *item* looks like an instructor name.

    Matches strings containing an initial followed by a period
    (e.g. "J. Smith") or the literal word "Staff".
    """
    # bool(...) normalizes the re.Match object the original returned, so
    # callers always get a plain bool (truthiness is unchanged).
    return bool(re.search(r'\w\.', item)) or "Staff" in item
# Module-level scratch state shared by the spider callbacks.
last_class_number = 0  # NOTE(review): never updated in the visible code
classDict = {}  # accumulates {class_number: details} across result pages
class ClassSpider(CrawlSpider):
    """Question code: attempts to crawl UCSC class-search result pages.

    NOTE(review): this is the broken version the question asks about.
    The site paginates via a re-POST of the search form (every "next" link
    points back at index.php), so the LinkExtractor Rule below issues plain
    GETs without the form body and never reaches page 2.
    """

    name = "classes"
    allowed_domains = ['pisa.ucsc.edu']
    start_urls = ['https://pisa.ucsc.edu/class_search/index.php']
    # NOTE(review): cb_kwargs=data passes every form field as a separate
    # keyword argument to parse_item, but parse_item(self, response) accepts
    # none — if the rule ever fired, the callback would raise a TypeError.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="row hide-print"]//a', restrict_text='next'), callback='parse_item', follow=True, cb_kwargs=data),
    )

    def print_link(self, response):
        # Debug helper: prints each class name found on the page.
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        for row in all_rows:
            # Regex splits "DEPT NUM - Section\xa0Title" into (course, title).
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            print(class_name)
        print("This activated")

    def start_requests(self):
        # POSTs the search form so the first results page loads correctly;
        # only the follow-up pages (handled by the Rule) fail.
        urls = ['https://pisa.ucsc.edu/class_search/index.php']
        for url in urls:
            yield scrapy.FormRequest(url,
                headers=all_class_headers,
                formdata=data,
                callback=self.parse_item)

    def parse_item(self, response):
        #page = response.url.split("/")[-2]
        # One panel div per class row on the results page.
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        for row in all_rows:
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            # NOTE(review): .get() returns None when a node is missing, so
            # each .strip() below can raise AttributeError on sparse rows.
            professor = row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get().strip()
            class_number = row.xpath('(.//div[@class="panel-body"]//div)[2]/a/text()').get().strip()
            time = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[2]/text()').get().strip()
            location = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[1]/text()').get().strip()
            online_or_in_person = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-3 hide-print"])[3]/b/text()').get().strip()
            classDict[class_number] = {'professor': professor, 'class_name':class_name, 'time': time, 'location': location , 'online_or_in_person': online_or_in_person}
        # Returns the module-level dict, which accumulates across calls.
        return classDict
Solution
Is it not working because I have to re-submit the original Request body and headers with each subsequent request?
Answer: Yes, you can see that in devtools.
I think that a plain scrapy.Spider is more suitable than CrawlSpider for what you want to achieve.
import scrapy


class ClassSpider(scrapy.Spider):
    """Scrape UCSC class-search results, driving the POST-based pagination.

    Every "next" link on the site points back at index.php, so pagination
    must be done by re-POSTing the search form with an updated
    rec_start/rec_dur window — which is why a plain Spider (not CrawlSpider
    link extraction) is used here.
    """

    name = "classes"
    allowed_domains = ['pisa.ucsc.edu']
    start_urls = ['https://pisa.ucsc.edu/class_search/index.php']
    custom_settings = {'DOWNLOAD_DELAY': 0.4}

    # Pagination cursor: offset of the first record of the next page, and
    # the page size the site serves.
    recNumber = 0
    duration = 25

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Origin': 'https://pisa.ucsc.edu',
        'Accept-Language': 'en-us',
        'Host': 'pisa.ucsc.edu',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        # Fixed: the original said pisa.usc.edu (missing the "c"), so the
        # Referer did not match the Host being scraped.
        'Referer': 'https://pisa.ucsc.edu/class_search/',
        'Accept-Encoding': ['gzip', 'deflate', 'br'],
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    # Search-form body; empty binds mean "no filter".
    payload = {
        'action': 'results',
        'binds[:term]': '2228',
        'binds[:reg_status]': 'all',
        'binds[:subject]': '',
        'binds[:catalog_nbr_op]': '=',
        'binds[:catalog_nbr]': '',
        'binds[:title]': '',
        'binds[:instr_name_op]': '=',
        'binds[:instructor]': '',
        'binds[:ge]': '',
        'binds[:crse_units_op]': '=',
        'binds[:crse_units_from]': '',
        'binds[:crse_units_to]': '',
        'binds[:crse_units_exact]': '',
        'binds[:days]': '',
        'binds[:times]': '',
        'binds[:acad_career]': '',
        'binds[:asynch]': 'A',
        'binds[:hybrid]': 'H',
        'binds[:synch]': 'S',
        'binds[:person]': 'P',
    }

    @staticmethod
    def _first_text(row, query):
        """Return the stripped text of the first node matching *query*.

        Robustness fix: .get() returns None when the node is missing, and
        the original code's .get().strip() raised AttributeError on such
        rows; default='' yields an empty string instead.
        """
        return row.xpath(query).get(default='').strip()

    def start_requests(self):
        """POST the search form once to obtain the first results page."""
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url,
                                     headers=self.headers,
                                     formdata=self.payload,
                                     callback=self.parse_item)

    def parse_item(self, response):
        """Yield one dict of classes per page, then request the next page."""
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        classDict = {}
        for row in all_rows:
            # Regex splits "DEPT NUM - Section\xa0Title" into (course, title).
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            professor = self._first_text(row, '(.//div[@class="panel-body"]//div)[3]/text()')
            class_number = self._first_text(row, '(.//div[@class="panel-body"]//div)[2]/a/text()')
            time = self._first_text(row, '(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[2]/text()')
            location = self._first_text(row, '(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[1]/text()')
            online_or_in_person = self._first_text(row, '(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-3 hide-print"])[3]/b/text()')
            classDict[class_number] = {'professor': professor, 'class_name': class_name, 'time': time, 'location': location, 'online_or_in_person': online_or_in_person}
        # An empty page means we ran past the last record: stop paginating.
        if not classDict:
            return
        yield classDict
        # Request the next page by re-POSTing the form with an advanced
        # record window ('action' switches from 'results' to 'next').
        self.payload['action'] = 'next'
        self.recNumber += self.duration
        self.payload['rec_start'] = str(self.recNumber)
        self.payload['rec_dur'] = str(self.duration)
        yield scrapy.FormRequest(url=response.url,
                                 headers=self.headers,
                                 formdata=self.payload,
                                 callback=self.parse_item)
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.