Issue
I do not know how to scrape AJAX pages. There is no pagination on the website; more results are loaded by clicking the "Load more" button.
This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false
import scrapy
from scrapy.http import Request
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
import pandas as pd
class TestSpider(scrapy.Spider):
    """Spider that collects exhibitor details from the exhibitor gallery page.

    The gallery page is requested with a ``SeleniumRequest``; every exhibitor
    link found on it is then followed with a plain Scrapy ``Request`` and the
    title, address, website and phone are extracted from the detail page.
    """

    name = 'test'

    def start_requests(self):
        """Yield the single ``SeleniumRequest`` for the gallery page."""
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        """Follow every exhibitor link found in the gallery cards."""
        links = response.xpath("//h3[@class='card-Title\nbreak-word\nf3\nmb1\nmt0']//a//@href").extract()
        for link in links:
            # hrefs may be relative, so resolve them against the page URL.
            yield Request(response.urljoin(link), callback=self.parse_book)

    def parse_book(self, response):
        """Extract one exhibitor record from a detail page.

        Yields a dict with the keys ``title``, ``address``, ``website`` and
        ``phone``. A field whose selector matches nothing is yielded as
        ``None`` instead of raising ``AttributeError``.
        """
        title = response.css(".mr3-m::text").get()
        address = response.css(".showcase-address::text").get()
        website = response.xpath("//li[@class='dib ml3 mr3']//a[starts-with(@href, 'http')]/@href").get()
        phone = response.xpath("//li[@class='dib ml3 mr3'] //span[contains(text(), 'Phone:')]/following-sibling::text()").get()

        # .get() returns None when nothing matches; only clean up real values
        # so that a missing field does not abort the whole item.
        if address is not None:
            address = address.strip()
        if website is not None:
            website = website.strip()
        if phone is not None:
            phone = phone.strip().replace("-", "")

        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone,
        }
Solution
Okay, try the following script, which traverses the whole exhibitor list and grabs all the fields you want:
import scrapy
from scrapy.selector import Selector
class MapYourShowSpider(scrapy.Spider):
    """Spider that reads the exhibitor list from the site's AJAX endpoint.

    It sends one search request to ``content_url``, then requests the detail
    URL built from ``inner_base`` for every exhibitor id in the response and
    yields one dict per exhibitor (title, address, website, phone).
    """

    name = "mapyourshow"
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'

    # Sent with both the search request and every detail request.
    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }

    # NOTE(review): searchsize is hard-coded; presumably it is the total number
    # of exhibitors so that everything arrives in one response -- confirm.
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',
        'start': '0',
    }

    def start_requests(self):
        """Yield the single GET request that fetches the exhibitor list."""
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one detail request per exhibitor hit in the JSON response."""
        hits = response.json()['DATA']['results']['exhibitor']['hit']
        for hit in hits:
            detail_url = self.inner_base.format(hit['fields']['exhid_l'])
            yield scrapy.Request(
                url=detail_url,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Extract one exhibitor record from a detail response.

        The response is read as JSON, and the HTML fragment stored under
        ``DATA.BODYHTML`` is parsed with a ``Selector``.
        """
        body_html = response.json()['DATA']['BODYHTML']
        sel = Selector(text=body_html)

        title = sel.css("h2::text").get()

        # getall() always returns a list (possibly empty), so no exception
        # handling is needed; whitespace inside each text node is collapsed
        # and the nodes are joined with single spaces.
        address_parts = sel.css("p.showcase-address::text").getall()
        address = ' '.join(' '.join(part.split()) for part in address_parts)

        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath("normalize-space(//*[starts-with(@class,'showcase-web-phone')]/li[./*[.='Phone:']]/span/following::text())").get()

        yield {"title": title, "address": address, "website": website, "phone": phone}
Answered By - SIM
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.