Issue
I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
import logging


class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    # Lua script run by the Splash 'execute' endpoint to log in.
    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false

        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
        headers = {
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
            ['Referer'] = 'https://www.google.com'
        }
        splash:set_custom_headers(headers)

        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))

        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))

        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))

        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))

        return splash:html()
    end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"),
             callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(
            url='https://www.somesite.com/login',
            callback=self.post_login,
            endpoint='execute',
            args={'lua_source': self.script},
        )

    def use_splash(self, request):
        # Route rule-generated requests through Splash.
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        # Override so that Splash responses are also followed by the rules.
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)

    def parse_item(self, response):
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
I came across this and tried to implement things the same way, but parse_item is still never run. In the logs, I never see hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here.
Solution
I ditched the CrawlSpider and converted it to a regular Spider, and things are working fine now.
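The answer doesn't include code, but a minimal sketch of that conversion (reusing the Lua login script and the sidebar XPath from the question, both of which are assumptions here) could look like this: the Rule/LinkExtractor machinery is dropped, and the followed link is requested explicitly through Splash.

import scrapy
from scrapy_splash import SplashRequest


class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']
    script = '''...'''  # same Lua login script as in the question

    def start_requests(self):
        # Log in through the Splash 'execute' endpoint, then continue in post_login.
        yield SplashRequest(
            url='https://www.somesite.com/login',
            callback=self.post_login,
            endpoint='execute',
            args={'lua_source': self.script},
        )

    def post_login(self, response):
        self.logger.info('hey from login!')
        # Follow the sidebar link manually instead of relying on a Rule,
        # rendering it through Splash as well.
        href = response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
        if href:
            yield SplashRequest(
                url=response.urljoin(href),
                callback=self.parse_item,
                endpoint='render.html',
                args={'wait': 1},
            )

    def parse_item(self, response):
        self.logger.info('hey from parse_item!')

Because the follow-up request is yielded directly from post_login, there is no dependency on CrawlSpider's _requests_to_follow logic, which is where the original setup was silently dropping the link.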
Answered By - J86