Issue
I'm confused about why Scrapy won't extract the link for the next page in the following code. I believe it may have something to do with the fact that every link has a URL of index.php. Is it not working because I have to re-submit the original request body and headers with each subsequent request?
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# Request headers for the class-search POST.
# NOTE(review): the original dict listed 'Content-Type' twice with the same
# value (Python silently keeps the last entry), and the Referer pointed at
# pisa.usc.edu instead of pisa.ucsc.edu — both fixed here.
all_class_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    # Scrapy accepts a list of values for a single header.
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}
# Form body for the class-search POST: 'action=results' asks the site for
# the first page of search results. Empty bind values mean "no filter";
# '2228' presumably selects a specific academic term — TODO confirm against
# the live search form.
data = {
    'action': 'results',
    'binds[:term]': '2228',
    'binds[:reg_status]': 'all',
    'binds[:subject]': '',
    'binds[:catalog_nbr_op]': '=',
    'binds[:catalog_nbr]': '',
    'binds[:title]': '',
    'binds[:instr_name_op]': '=',
    'binds[:instructor]': '',
    'binds[:ge]': '',
    'binds[:crse_units_op]': '=',
    'binds[:crse_units_from]': '',
    'binds[:crse_units_to]': '',
    'binds[:crse_units_exact]': '',
    'binds[:days]': '',
    'binds[:times]': '',
    'binds[:acad_career]': '',
    # Instruction-mode flags: asynchronous / hybrid / synchronous / in-person.
    'binds[:asynch]': 'A',
    'binds[:hybrid]': 'H',
    'binds[:synch]': 'S',
    'binds[:person]': 'P',
}
page_2_form_data_additions = {'rec_start' : '0', 'rec_dur' : '25'}
def professor_filter(item):
    """Return True if *item* looks like an instructor name.

    Matches strings containing an initial followed by a period
    (e.g. "J. Smith") or the literal word "Staff".
    """
    # bool(...) normalizes the re.Match object the original returned, so
    # callers always get a plain bool (truthiness is unchanged).
    return bool(re.search(r'\w\.', item)) or "Staff" in item
# Module-level scratch state shared by the spider callbacks.
last_class_number = 0  # NOTE(review): never updated in the visible code
classDict = {}  # accumulates {class_number: details} across result pages
class ClassSpider(CrawlSpider):
    """Question code: attempts to crawl UCSC class-search result pages.

    NOTE(review): this is the broken version the question asks about.
    The site paginates via a re-POST of the search form (every "next" link
    points back at index.php), so the LinkExtractor Rule below issues plain
    GETs without the form body and never reaches page 2.
    """

    name = "classes"
    allowed_domains = ['pisa.ucsc.edu']
    start_urls = ['https://pisa.ucsc.edu/class_search/index.php']
    # NOTE(review): cb_kwargs=data passes every form field as a separate
    # keyword argument to parse_item, but parse_item(self, response) accepts
    # none — if the rule ever fired, the callback would raise a TypeError.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="row hide-print"]//a', restrict_text='next'), callback='parse_item', follow=True, cb_kwargs=data),
    )

    def print_link(self, response):
        # Debug helper: prints each class name found on the page.
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        for row in all_rows:
            # Regex splits "DEPT NUM - Section\xa0Title" into (course, title).
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            print(class_name)
        print("This activated")

    def start_requests(self):
        # POSTs the search form so the first results page loads correctly;
        # only the follow-up pages (handled by the Rule) fail.
        urls = ['https://pisa.ucsc.edu/class_search/index.php']
        for url in urls:
            yield scrapy.FormRequest(url,
                headers=all_class_headers,
                formdata=data,
                callback=self.parse_item)

    def parse_item(self, response):
        #page = response.url.split("/")[-2]
        # One panel div per class row on the results page.
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        for row in all_rows:
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            # NOTE(review): .get() returns None when a node is missing, so
            # each .strip() below can raise AttributeError on sparse rows.
            professor = row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get().strip()
            class_number = row.xpath('(.//div[@class="panel-body"]//div)[2]/a/text()').get().strip()
            time = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[2]/text()').get().strip()
            location = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[1]/text()').get().strip()
            online_or_in_person = row.xpath('(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-3 hide-print"])[3]/b/text()').get().strip()
            classDict[class_number] = {'professor': professor, 'class_name':class_name, 'time': time, 'location': location , 'online_or_in_person': online_or_in_person}
        # Returns the module-level dict, which accumulates across calls.
        return classDict
Solution
Is it not working because I have to re-submit the original Request body and headers with each subsequent request?
Answer: Yes, you can see that in devtools.
I think that a plain scrapy.Spider is more suitable than CrawlSpider for what you want to achieve.
import scrapy


class ClassSpider(scrapy.Spider):
    """Scrape UCSC class-search results, driving the POST-based pagination.

    Every "next" link on the site points back at index.php, so pagination
    must be done by re-POSTing the search form with an updated
    rec_start/rec_dur window — which is why a plain Spider (not CrawlSpider
    link extraction) is used here.
    """

    name = "classes"
    allowed_domains = ['pisa.ucsc.edu']
    start_urls = ['https://pisa.ucsc.edu/class_search/index.php']
    custom_settings = {'DOWNLOAD_DELAY': 0.4}

    # Pagination cursor: offset of the first record of the next page, and
    # the page size the site serves.
    recNumber = 0
    duration = 25

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Origin': 'https://pisa.ucsc.edu',
        'Accept-Language': 'en-us',
        'Host': 'pisa.ucsc.edu',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        # Fixed: the original said pisa.usc.edu (missing the "c"), so the
        # Referer did not match the Host being scraped.
        'Referer': 'https://pisa.ucsc.edu/class_search/',
        'Accept-Encoding': ['gzip', 'deflate', 'br'],
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    # Search-form body; empty binds mean "no filter".
    payload = {
        'action': 'results',
        'binds[:term]': '2228',
        'binds[:reg_status]': 'all',
        'binds[:subject]': '',
        'binds[:catalog_nbr_op]': '=',
        'binds[:catalog_nbr]': '',
        'binds[:title]': '',
        'binds[:instr_name_op]': '=',
        'binds[:instructor]': '',
        'binds[:ge]': '',
        'binds[:crse_units_op]': '=',
        'binds[:crse_units_from]': '',
        'binds[:crse_units_to]': '',
        'binds[:crse_units_exact]': '',
        'binds[:days]': '',
        'binds[:times]': '',
        'binds[:acad_career]': '',
        'binds[:asynch]': 'A',
        'binds[:hybrid]': 'H',
        'binds[:synch]': 'S',
        'binds[:person]': 'P',
    }

    @staticmethod
    def _first_text(row, query):
        """Return the stripped text of the first node matching *query*.

        Robustness fix: .get() returns None when the node is missing, and
        the original code's .get().strip() raised AttributeError on such
        rows; default='' yields an empty string instead.
        """
        return row.xpath(query).get(default='').strip()

    def start_requests(self):
        """POST the search form once to obtain the first results page."""
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url,
                                     headers=self.headers,
                                     formdata=self.payload,
                                     callback=self.parse_item)

    def parse_item(self, response):
        """Yield one dict of classes per page, then request the next page."""
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        classDict = {}
        for row in all_rows:
            # Regex splits "DEPT NUM - Section\xa0Title" into (course, title).
            class_name = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            professor = self._first_text(row, '(.//div[@class="panel-body"]//div)[3]/text()')
            class_number = self._first_text(row, '(.//div[@class="panel-body"]//div)[2]/a/text()')
            time = self._first_text(row, '(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[2]/text()')
            location = self._first_text(row, '(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-6"])[1]/text()')
            online_or_in_person = self._first_text(row, '(.//div[@class="panel-body"]//div[@class="col-xs-6 col-sm-3 hide-print"])[3]/b/text()')
            classDict[class_number] = {'professor': professor, 'class_name': class_name, 'time': time, 'location': location, 'online_or_in_person': online_or_in_person}
        # An empty page means we ran past the last record: stop paginating.
        if not classDict:
            return
        yield classDict
        # Request the next page by re-POSTing the form with an advanced
        # record window ('action' switches from 'results' to 'next').
        self.payload['action'] = 'next'
        self.recNumber += self.duration
        self.payload['rec_start'] = str(self.recNumber)
        self.payload['rec_dur'] = str(self.duration)
        yield scrapy.FormRequest(url=response.url,
                                 headers=self.headers,
                                 formdata=self.payload,
                                 callback=self.parse_item)
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.