Issue
I am trying to extract the class abbreviation ("ECON 114") and the class name ("Adv Quant Methods") from strings similar to "ECON 114 - 01 Adv Quant Methods"
in Python.
I am using the expression r'(?i)(\w+\s\w+)+\s-\s\w+[ ]+([\w\s]+\b)'
which works in my regex tester. However, when I run this in scrapy the return array is empty. What am I doing wrong? (code below)
import scrapy;
import re as pythonRe;
#with open('../econ.html', 'r') as f:
#html_string = f.read()
# HTTP headers for the POST request to the UCSC class-search endpoint.
# Fixed: the original literal listed 'Content-Type' twice; a dict literal
# silently keeps only the last occurrence, so the duplicate is dropped.
econ_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # NOTE(review): 'usc.edu' here vs 'ucsc.edu' in Host/Origin — looks like a typo; confirm.
    'Referer': 'https://pisa.usc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}
class ClassesSpider(scrapy.Spider):
    """Scrape the UCSC ECON class listing and print classes with their professors.

    POSTs the class-search form, then extracts (abbreviation, title) pairs
    from the result links and instructor names from the result rows.
    """
    name = "classes"

    def start_requests(self):
        # Fixed: the original URL used 'pisa.usc.edu', which does not match the
        # 'pisa.ucsc.edu' Host/Origin headers and is the wrong site.
        urls = [
            'https://pisa.ucsc.edu/class_search/index.php'
        ]
        for url in urls:
            # The body is the url-encoded search form: term 2228, subject ECON,
            # all registration statuses.
            yield scrapy.Request(url=url, method="POST", headers=econ_headers, body='action=results&binds%5B%3Aterm%5D=2228&binds%5B%3Areg_status%5D=all&binds%5B%3Asubject%5D=ECON&binds%5B%3Acatalog_nbr_op%5D=%3D&binds%5B%3Acatalog_nbr%5D=&binds%5B%3Atitle%5D=&binds%5B%3Ainstr_name_op%5D=%3D&binds%5B%3Ainstructor%5D=&binds%5B%3Age%5D=&binds%5B%3Acrse_units_op%5D=%3D&binds%5B%3Acrse_units_from%5D=&binds%5B%3Acrse_units_to%5D=&binds%5B%3Acrse_units_exact%5D=&binds%5B%3Adays%5D=&binds%5B%3Atimes%5D=&binds%5B%3Aacad_career%5D=&binds%5B%3Aasynch%5D=A&binds%5B%3Ahybrid%5D=H&binds%5B%3Asynch%5D=S&binds%5B%3Aperson%5D=P', callback=self.parse)

    def parse(self, response):
        def professor_filter(item):
            # Keep strings with an abbreviated initial ("J. Smith") or "Staff".
            if (pythonRe.search(r'\w\.', item) or "Staff" in item):
                return True

        page = response.url.split("/")[-2]
        classDict = {}
        # Fixed: the link text separates the section number from the title with
        # non-breaking spaces (\xa0, from &nbsp; in the HTML), not regular
        # spaces, so the pattern must use [\xa0]+ instead of [ ]+ — with [ ]+
        # the .re() call returned an empty list.
        classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[\xa0]+([\w\s]+\b)')
        professors = response.xpath('//div[contains(@class, "col-xs-6 col-sm-3")]/text()').getall()
        professors_filtered = list(filter(professor_filter, professors))
        #for x in range((len(classes))):
        #    classDict[classes[x]] = {'professor': professors_filtered[x]}
        print(classes)
        print(len(classes))
        print(professors_filtered)
        print(len(professors_filtered))
        print(professors)
        print(classDict)
        # Dump the raw response for offline inspection.
        filename = f'class-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
Solution
If you would first get full text for classes
and display it
then you would see that Scrapy
gives \xa0 (a non-breaking space, decoded from the HTML entity &amp;nbsp;)
instead of a regular space.
And you have to use [\xa0]+
instead of [ ]+
# [\xa0]+ matches the non-breaking spaces (decoded &nbsp;) that separate the section number from the title in the link text.
classes = response.xpath('//a[contains(@id, "class_id")]/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+[\xa0]+([\w\s]+\b)')
and this gives me:
classes: ['ECON 1', 'Intro Microeconomic', 'ECON 1', 'Intro Microeconomic', 'ECON 2', 'Intro Macroeconomic', 'ECON 10A', 'Econ of Accounting', 'ECON 10A', 'Econ of Accounting', 'ECON 11A', 'Math Methd for Econ', 'ECON 11B', 'Math Methds Econ II', 'ECON 100A', 'Intermed Microecon', 'ECON 100A', 'Intermed Microecon', 'ECON 100B', 'Intermed Macroecon', 'ECON 101', 'Managerial Econ', 'ECON 104', 'Numbr Truth', 'ECON 111A', 'Intermed Account I', 'ECON 113', 'Intro Econometrics', 'ECON 113', 'Intro Econometrics', 'ECON 114', 'Adv Quant Methods', 'ECON 117B', 'Tax Factors', 'ECON 125', 'Econ History Of US', 'ECON 126', 'Why Succeed', 'ECON 133', 'Security Markets', 'ECON 136', 'Business Strategy', 'ECON 141', 'Internatl Finance', 'ECON 150', 'Public Finance', 'ECON 161A', 'Marketing', 'ECON 166A', 'Game Theory']
I think the problem is that response.body
gives the original string with HTML, but other functions may have to convert this string to an HTML tree
(as the modules lxml
or BeautifulSoup
do), and that may automatically convert HTML entities
(like &amp;nbsp;
) to chars.
As far as I know, Scrapy
uses parsel to select elements in HTML.
See Scrapy doc: Selectors
EDIT:
Full working code with other changes:
- I use FormRequest
- first I search rows in the table and later search for the class and the professor in every row separately.
import scrapy
import re
# HTTP headers for the POST request to the UCSC class-search endpoint.
# Fixed: the original literal listed 'Content-Type' twice; a dict literal
# silently keeps only the last occurrence, so the duplicate is dropped.
econ_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # NOTE(review): 'usc.edu' here vs 'ucsc.edu' in Host/Origin — looks like a typo; confirm.
    'Referer': 'https://pisa.usc.edu/class_search/',
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}
# Search-form fields, POSTed via FormRequest (which url-encodes them).
data = {
    "action": "results",
    # term 2228, all registration statuses, subject ECON
    "binds[:term]": "2228",
    "binds[:reg_status]": "all",
    "binds[:subject]": "ECON",
    # remaining filters left empty / at their defaults
    "binds[:catalog_nbr_op]": "=",
    "binds[:catalog_nbr]": "",
    "binds[:title]": "",
    "binds[:instr_name_op]": "=",
    "binds[:instructor]": "",
    "binds[:ge]": "",
    "binds[:crse_units_op]": "=",
    "binds[:crse_units_from]": "",
    "binds[:crse_units_to]": "",
    "binds[:crse_units_exact]": "",
    "binds[:days]": "",
    "binds[:times]": "",
    "binds[:acad_career]": "",
    # include all instruction modes
    "binds[:asynch]": "A",
    "binds[:hybrid]": "H",
    "binds[:synch]": "S",
    "binds[:person]": "P",
}
def professor_filter(item):
    """Return True when *item* looks like an instructor entry.

    Accepts strings containing an abbreviated initial (e.g. "J. Smith")
    or the literal placeholder "Staff".
    """
    # bool() so the function always returns True/False instead of leaking a
    # truthy re.Match object (callers only rely on truthiness, so this is
    # backward compatible).
    return bool(re.search(r'\w\.', item)) or "Staff" in item
class ClassesSpider(scrapy.Spider):
    """Scrape the UCSC ECON class listing, yielding {class, professor} items.

    Searches each result row ("rowpanel_" div) separately so the class name
    and the professor stay aligned.
    """
    name = "classes"

    def start_requests(self):
        urls = ['https://pisa.ucsc.edu/class_search/index.php']
        for url in urls:
            #yield scrapy.Request(url,
            #                     headers=econ_headers,
            #                     body='action=results&binds%5B%3Aterm%5D=2228&binds%5B%3Areg_status%5D=all&binds%5B%3Asubject%5D=ECON&binds%5B%3Acatalog_nbr_op%5D=%3D&binds%5B%3Acatalog_nbr%5D=&binds%5B%3Atitle%5D=&binds%5B%3Ainstr_name_op%5D=%3D&binds%5B%3Ainstructor%5D=&binds%5B%3Age%5D=&binds%5B%3Acrse_units_op%5D=%3D&binds%5B%3Acrse_units_from%5D=&binds%5B%3Acrse_units_to%5D=&binds%5B%3Acrse_units_exact%5D=&binds%5B%3Adays%5D=&binds%5B%3Atimes%5D=&binds%5B%3Aacad_career%5D=&binds%5B%3Aasynch%5D=A&binds%5B%3Ahybrid%5D=H&binds%5B%3Asynch%5D=S&binds%5B%3Aperson%5D=P',
            #                     callback=self.parse)
            # FormRequest url-encodes `data` and issues a POST for us.
            yield scrapy.FormRequest(url,
                                     headers=econ_headers,
                                     formdata=data,
                                     callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]  # used by the (commented) debug dump below
        # One "rowpanel_" div per class listing.
        all_rows = response.xpath('//div[contains(@id, "rowpanel_")]')
        classDict = {}
        for row in all_rows:
            # The link text separates the section number from the title with
            # non-breaking spaces (\xa0, from &nbsp;), hence \xa0+ in the pattern.
            classname = row.xpath('.//h2//a/text()').re(r'(?i)(\w+\s\w+)+\s-\s\w+\xa0+([\w\s]+\b)')
            # Fixed: .get() returns None when the xpath matches nothing, and
            # None.strip() would raise AttributeError — guard with `or ''`.
            professor = (row.xpath('(.//div[@class="panel-body"]//div)[3]/text()').get() or '').strip()
            print(classname, professor)
            if professor and professor_filter(professor):
                classDict[tuple(classname)] = [professor]
                yield {'class': tuple(classname), 'professor': professor}  # it will write to file csv
            else:
                print('skip:', professor)
        print(classDict)
        #filename = f'class-{page}.html'
        #with open(filename, 'wb') as f:
        #    f.write(response.body)
        #self.log(f'Saved file {filename}')
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
# Crawler settings: write every yielded item to output.csv.
settings = {
    #'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # FEEDS requires Scrapy >= 2.1
}
c = CrawlerProcess(settings)
c.crawl(ClassesSpider)
c.start()  # blocks until the crawl finishes
Answered By - furas
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.