Issue
I keep getting a "503 Service Unavailable" response when I try to scrape the Checkatrade website. I have tried setting concurrent requests to 1, setting download_delay to 10, and setting a user_agent, but the request is blocked on the very first attempt, at the start_url.
The code below shows my attempt. I have also tried Selenium (I have left the Selenium code in comments in the code below), and even then the start_url does not register:
import scrapy
from urllib.parse import urljoin
#from scrapy.http import TextResponse
#from selenium import webdriver
from checkatrade.items import CheckatradeItem
class checkatradeSpider(scrapy.Spider):
    """Spider that reads the member table of the Checkatrade "A" directory page.

    Every row matched beneath the element whose class is ``directory`` is
    turned into one ``CheckatradeItem``.
    """

    name = "checkatrade"
    allowed_domains = ["checkatrade.com"]
    start_urls = ["https://www.checkatrade.com/Directory/A"]

    # Selenium experiment, intentionally left disabled:
    # def __init__(self):
    #     try:
    #         self.driver = webdriver.Chrome("C:/Users/andrew/Downloads/chromedriver_win32/chromedriver.exe")
    #     except:
    #         self.driver = webdriver.Chrome("C:/Users/andre/Downloads/chromedriver_win32/chromedriver.exe")

    def parse(self, response):
        """Yield one ``CheckatradeItem`` per directory table row.

        Each field holds the list returned by ``.extract()`` for a
        ``normalize-space(...)`` XPath evaluated relative to the row.
        """
        # (item field, row-relative XPath) pairs, in assignment order.
        field_queries = (
            ("member", "normalize-space(.//td/a/text())"),
            ("memberurl", "normalize-space(.//td/a/@href)"),
            ("basedin", "normalize-space(.//td[2]/text())"),
            ("memberfor", "normalize-space(.//td[3]/text())"),
            ("reports", "normalize-space(.//td[4]/text())"),
            ("rating", "normalize-space(.//td[5]/text())"),
        )

        # Selenium experiment, intentionally left disabled:
        # self.driver.get(response.url)
        # response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        # for sel in response1.xpath('//*[@class="directory"]/tbody/tr'):
        for row in response.xpath('//*[@class="directory"]/tbody/tr'):
            extracted = {
                field: row.xpath(query).extract()
                for field, query in field_queries
            }
            item = CheckatradeItem()
            for field, value in extracted.items():
                item[field] = value
            yield item

        # self.driver.close()
        # try:
        #     self.driver = webdriver.Chrome("C:/Users/andrew/Downloads/chromedriver_win32/chromedriver.exe")
        # except:
        #     self.driver = webdriver.Chrome("C:/Users/andre/Downloads/chromedriver_win32/chromedriver.exe")
Solution
The page loads without any issue when it is driven through Selenium, as in the script below:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
#options to add as arguments
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Print the member link text of every row in the Checkatrade "A" directory
# table, using a real Chrome session driven by Selenium.

# Upper bound, in seconds, on how long to wait for the table rows to appear.
ROW_WAIT_TIMEOUT = 10
DIRECTORY_ROWS_XPATH = '//*[@class="directory"]/tbody/tr'

option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
# Keep the Chrome window open after the script finishes.
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=option,
)
driver.get('https://www.checkatrade.com/Directory/A')

# Wait explicitly for the rows instead of sleeping a fixed two seconds: the
# loop starts as soon as the rows are in the DOM, and a page that never
# renders them raises TimeoutException rather than silently printing nothing.
rows = WebDriverWait(driver, ROW_WAIT_TIMEOUT).until(
    EC.presence_of_all_elements_located((By.XPATH, DIRECTORY_ROWS_XPATH))
)
for sel in rows:
    member = sel.find_element(By.XPATH, './/td/a').text
    print(member)
Output:
A B Smith & Sons (ASmithAndSons)
A - Klass Carpentry and Joinery Ltd (AklassCarpentryAndJoinery)
A Plaster (APlaster)
A - Z Asbestos Management Ltd (AZAsbestosManagement)
A - Z Property Services (AZPropertyServices1024583)
A - Z Repairs & Groundworks (AZRepairsGroundworks)
A & A Building Construction (AandAConstructionLondon)
A & A Brown Joiners Cabinet Makers (AABrownJoinersCabinetMakers)
A & A Builders (Surrey) Ltd (AABuildersLtd)
A & A Building Services (AAndABuildingServices)
A & A Cctv & Security Ltd (AACctvSecurityLtd)
A & A Concrete Repairs (AAConcreteRepairs)
A & A Decorator (AandADecorator)
A & A Domestics (AandADomestics)
A & A Double Glazing (AADoubleGlazing)
A & A Drain Services Ltd (AADrainServicesLtd)
A & A Electrical (AAElectrical978398)
A & A Electrics (AAElectrics)
A & A Fire Protection (AAFireProtection)
A & A Insulation Services Ltd (AandAInsulationServicesLtd)
A & A King Building Contractors (AAKingBuildingContractors)
A & A Lamb Ltd (AALambLtd)
A & A Landscape and Building (AAndALandscapeandBuilding)
A & A Landscaping & Paving (AALandscapingPaving)
... and so on
Answered By - F.Hoque
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.