Issue
This is my code, not sure what I am doing wrong here. Appreciate any help.
from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd
url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)


class WebSpider(scrapy.Spider):
    """Spider that drives a shared Selenium Chrome session to page through the
    UFC athlete listing, clicking the pagination link until it disappears.

    Fixes to the original:
      * ``scrapy.spider`` -> ``scrapy.Spider`` (capital S) — this is what caused
        the reported "module 'scrapy' has no attribute 'spider'" error.
      * ``self.drive`` -> ``self.driver`` (typo).
      * the element lookup is moved INSIDE the try: ``find_element_by_xpath``
        raises ``NoSuchElementException`` when the link is gone, which previously
        crashed the loop instead of ending it.
      * the bare ``except:`` is narrowed and the builtin ``next`` is no longer
        shadowed.
    """

    name = "Web_Spider"
    # allowed_domains expects bare domain names, not full URLs.
    allowed_domains = ['ufc.com']
    start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']

    def __init__(self, *args, **kwargs):
        # Let Scrapy run its own initialisation before attaching the browser.
        super().__init__(*args, **kwargs)
        self.driver = driver  # reuse the module-level Chrome session

    def parse(self, response):
        """Open the page and keep clicking the "load more" link; stop once the
        link can no longer be found or clicked, then close the browser."""
        self.driver.get(response.url)
        while True:
            try:
                load_more = self.driver.find_element_by_xpath(
                    '//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')
                load_more.click()
                time.sleep(1)  # give the ajax-loaded content a moment to render
            except Exception:  # NoSuchElementException / element not clickable -> done
                break
        self.driver.close()
I keep getting the error "AttributeError: module 'scrapy' has no attribute 'spider'". I'm not sure what to do here — Scrapy is installed correctly and up to date.
Solution
Depending on what you are trying to do, I wouldn't go with Selenium here, since you can fetch the data directly through the site's ajax endpoint. Selenium would still work, but it's overkill and less efficient.
Try this:
import requests
from bs4 import BeautifulSoup
import re
# Hit the Drupal ajax endpoint the site's own "load more" button calls, so no
# browser automation is needed.
url = 'https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'}


def _card_text(card, tag, class_pattern, default='N/A'):
    """Return the stripped text of the first *tag* on *card* whose class
    matches *class_pattern*, or *default* when the tag is absent.

    Replaces the original bare ``except:`` blocks: the only expected failure
    is ``card.find`` returning ``None``, which is handled explicitly here.
    """
    node = card.find(tag, {'class': re.compile(class_pattern)})
    return node.text.strip() if node is not None else default


page = 1
while True:
    payload = {
        'view_name': 'all_athletes',
        'view_display_id': 'page',
        'view_path': '/athletes/all',
        'pager_element': '0',
        'gender': 'All',
        'page': str(page),
    }
    json_data = requests.post(url, headers=headers, data=payload).json()
    print(f'Page: {page}')
    page += 1

    # The last element of the ajax response carries the rendered HTML fragment.
    soup = BeautifulSoup(json_data[-1]['data'], 'html.parser')
    player_cards = soup.find_all(
        'div', {'class': re.compile('.*view-mode-all-athletes-result.*')})
    if not player_cards:
        # An empty page means we've paged past the end of the listing.
        break

    for player_card in player_cards:
        name = _card_text(player_card, 'span', '.*athlete__name.*')
        weight_class = _card_text(player_card, 'div', '.*weight-class.*')
        record = _card_text(player_card, 'span', '.*athlete__record.*')
        print('\t%s - %s\t%s' % (name, weight_class, record))
Answered By - chitown88
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.