Issue
How can I make Selenium wait for Scrapy to scrape the information needed from the first page, and only then click the next-page button and scrape the next page? Ultimately, I am trying to repeat this process until the last page is reached, which is page 301.
# -*- coding: utf-8 -*-
from typing_extensions import Self
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which
#login info
# Placeholder credentials; the spider's __init__ types these into the
# "username" and "password" fields of the login form.
username = "xxx"
password = "xxx"
class HtSpiderSelenium(scrapy.Spider):
    """Spider from the question: logs in with Selenium and scrapes one snapshot.

    The browser work happens entirely in ``__init__``: log in, click the
    next-page button once, store the rendered HTML and close the window.
    ``parse`` then only reads that stored HTML, so a single page is scraped;
    this is the limitation the question asks about.
    """

    name = 'ht_selenium1'
    allowed_domains = ['https://app.xxx.bootstart.tech']
    start_urls = ['https://app.xxx.bootstart.tech']

    def __init__(self):
        """Log in, advance one page and keep the page source in ``self.html``."""
        login_page = "https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https%3A%2F%2Fapp.xxx.bootstart.tech%2F%3Fredirect_fragment%3D%252Fstartup&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code"
        next_button = "/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a"

        browser_options = Options()
        #browser_options.add_argument("--headless")

        # As in the original listing, a first Chrome instance is started and
        # then immediately replaced by a second one created with the options.
        browser = webdriver.Chrome(executable_path="./chromedriver")
        browser = webdriver.Chrome(
            executable_path="./chromedriver",
            options=browser_options,
        )

        # Open the login page and submit the module-level credentials.
        browser.get(login_page)
        user_field = browser.find_element_by_id("username")
        user_field.send_keys(username)
        pass_field = browser.find_element_by_id("password")
        pass_field.send_keys(password)
        browser.find_element_by_name("login").click()
        sleep(15)  # fixed pause after submitting the login form

        # Click the next-page button once, pause, then snapshot the HTML.
        browser.find_element_by_xpath(next_button).click()
        sleep(10)
        self.html = browser.page_source
        browser.close()

    def parse(self, response):
        """Yield one dict per matching ``col-sm-12`` div in the stored HTML.

        ``response`` is not used; the data comes from ``self.html``, which
        ``__init__`` captured with Selenium.
        """
        snapshot = Selector(text=self.html)
        cards = snapshot.xpath("//div[contains(@class, 'col-sm-12')]")
        for card in cards:
            title = card.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get()
            # This XPath starts with '//' and so is evaluated against the
            # whole document, exactly as in the original listing.
            blurb = ''.join(card.xpath('//div//p//div//text()').getall())
            place = card.xpath(".//h4//small[@class='ng-binding']//text()").get()
            sector = card.xpath(".//h4//span[@class='ng-binding']/text()").get()
            yield {
                'startup name': title,
                'startup descript': blurb,
                'startup location': place,
                'startup industry': sector,
            }
Solution
You can try to do something like this:
# -*- coding: utf-8 -*-
# from typing_extensions import Self
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which
#login info
# Credentials are looked up in the environment (HT_USERNAME / HT_PASSWORD) so
# real secrets never need to be written into the source file; the original
# "xxx" placeholders remain the defaults when the variables are not set.
import os

username = os.environ.get("HT_USERNAME", "xxx")
password = os.environ.get("HT_PASSWORD", "xxx")
class HtSpiderSelenium(scrapy.Spider):
    """Log in with Selenium, then scrape the listing page by page.

    Selenium owns the browser session for the whole crawl. ``parse`` scrapes
    the page that is currently rendered, clicks the next-page button, waits
    until the rendered HTML has actually changed, and repeats. Pagination
    stops when the button cannot be found or clicked, when the page no longer
    changes after a click (last page), or when ``max_pages`` is reached.
    """

    name = 'ht_selenium1'
    allowed_domains = ['app.xxx.bootstart.tech']

    # Login page; its redirect_uri parameter points back at the app.
    login_url = "https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https%3A%2F%2Fapp.xxx.bootstart.tech%2F%3Fredirect_fragment%3D%252Fstartup&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code"
    # Absolute XPath of the next-page link in the pagination bar.
    next_page_xpath = "/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a"
    # One match per scraped entry.
    results_xpath = "//div[contains(@class, 'col-sm-12')]"
    # Upper bound, in seconds, for every explicit wait below.
    wait_seconds = 30
    # Safety cap so the loop always terminates; the question states that the
    # listing has 301 pages. Raise it if the listing grows.
    max_pages = 301

    def __init__(self, *args, **kwargs):
        """Start Chrome, log in and use the post-login URL as the start URL.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments passed by Scrapy keep working.
        """
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        #chrome_options.add_argument("--headless")
        # Exactly one browser is started (the original created two and
        # leaked the first one).
        self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        try:
            self._login()
        except Exception:
            # Do not leave an orphaned browser behind if the login fails.
            self.driver.quit()
            raise
        self.start_urls = [self.driver.current_url]

    def _login(self):
        """Submit the login form and wait for the redirect back to the app."""
        self.driver.get(self.login_url)
        self.driver.find_element(By.ID, "username").send_keys(username)
        self.driver.find_element(By.ID, "password").send_keys(password)
        self.driver.find_element(By.NAME, "login").click()
        # The login URL itself contains the app host inside its query string,
        # so test the start of the URL rather than a substring.
        app_prefix = "https://" + self.allowed_domains[0]
        WebDriverWait(self.driver, self.wait_seconds).until(
            lambda drv: drv.current_url.startswith(app_prefix)
        )

    def parse(self, response):
        """Yield one item per entry on every page of the listing.

        ``response`` is only used for its URL; the HTML that is scraped is
        the page source rendered by the Selenium browser.
        """
        # Imported here so the block does not depend on a file-level import.
        from selenium.common.exceptions import TimeoutException, WebDriverException

        self.driver.get(response.url)
        for _ in range(self.max_pages):
            # Give the current page time to render at least one entry.
            try:
                WebDriverWait(self.driver, self.wait_seconds).until(
                    EC.presence_of_element_located((By.XPATH, self.results_xpath))
                )
            except TimeoutException:
                self.logger.warning("No entries appeared within %s s", self.wait_seconds)

            rendered = self.driver.page_source
            yield from self._extract_startups(rendered)

            #next page button
            try:
                self.driver.find_element(By.XPATH, self.next_page_xpath).click()
                # Only continue once the click has produced different HTML;
                # on the last page nothing changes and the wait times out.
                WebDriverWait(self.driver, self.wait_seconds).until(
                    lambda drv: drv.page_source != rendered
                )
            except WebDriverException as exc:
                # Covers a missing or unclickable button and the timeout above.
                self.logger.info("Stopping pagination: %r", exc)
                break
        else:
            self.logger.warning("Stopped after max_pages=%d pages", self.max_pages)

    def _extract_startups(self, html):
        """Yield a dict for each entry found in the ``html`` string."""
        for startup in Selector(text=html).xpath(self.results_xpath):
            yield {
                'startup name': startup.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get(),
                # Leading '.' keeps the query inside this entry; without it
                # the XPath matches text from the whole document.
                'startup description': ''.join(startup.xpath('.//div//p//div//text()').getall()),
                'startup location': startup.xpath(".//h4//small[@class='ng-binding']//text()").get(),
                'startup industry': startup.xpath(".//h4//span[@class='ng-binding']/text()").get(),
            }

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider."""
        self.driver.quit()
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.