Issue
The Situation:
I recently started web scraping with Selenium and Scrapy. I am working on a project where I have a CSV file containing 42 thousand zip codes; my job is to take each zip code, enter it on this site, and scrape all the results.
The Problem:
The problem here is that in doing this I have to continuously click the 'load more' button until all the results have been displayed and only once that has finished I can collect the data.
This might not be much of an issue on its own; however, it takes 2 minutes to do this per zip code, and I have 42,000 zip codes to process.
The Code:
import scrapy
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from items import CareCreditItem
from datetime import datetime
import os
from scrapy.crawler import CrawlerProcess
# Zip code to search for. It is read once at start-up and later spliced into
# the locator URL and the output file name, so surrounding whitespace is
# trimmed here. (A module-level ``global`` statement has no effect and is
# therefore omitted.)
pin_code = input("enter pin code").strip()
class CareCredit1Spider(scrapy.Spider):
    """Collect practice detail pages for one zip code from the CareCredit locator.

    ``start_requests`` drives a headless Chrome session: it opens the results
    page for the module-level ``pin_code``, clicks the "next-page" button until
    it is missing or no longer interactable, and then turns every result link
    into a Scrapy request. ``parse`` extracts the name, address and phone
    number from each detail page.
    """

    name = 'care_credit_1'
    start_urls = ['https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1']

    def start_requests(self):
        """Yield one ``scrapy.Request`` per result link on the expanded page.

        All links are gathered first and the browser is shut down in a
        ``finally`` clause before any request is yielded, so a failure while
        paging does not leave a Chrome process running.
        """
        # Local import: base class of the Selenium errors, used for the
        # best-effort clicks below without also swallowing KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` would.
        from selenium.common.exceptions import WebDriverException

        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        # The driver binary is expected in the current working directory.
        path = os.path.join(os.getcwd(), "Chromedriver.exe")
        driver = webdriver.Chrome(path, options=options)

        try:
            # URL of the website
            url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty/" + pin_code + "/?Sort=D&Radius=75&Page=1"
            driver.maximize_window()
            # opening link in the browser
            driver.get(url)
            # NOTE(review): this implicit wait also applies to the cookie
            # lookup below, so a missing banner can block for the full
            # timeout -- consider lowering it.
            driver.implicitly_wait(200)

            # Dismiss the cookie banner if present; failure is not fatal.
            try:
                cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
                cookies.click()
            except WebDriverException:
                pass

            # Keep clicking "load more" until the button is gone or can no
            # longer be interacted with.
            while True:
                try:
                    load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
                    load_more.click()
                    driver.implicitly_wait(30)
                except (ElementNotInteractableException, NoSuchElementException):
                    break
                except (ElementClickInterceptedException, StaleElementReferenceException):
                    # Treated as transient (overlay or re-rendered button):
                    # look the button up again and retry.
                    pass

            # Best-effort click on "previous-page" before reading the results.
            # NOTE(review): presumably this restores the first page of the
            # listing -- confirm against the site.
            try:
                previous_page = driver.find_element_by_xpath('//*[@id="previous-page"]')
                previous_page.click()
            except WebDriverException:
                pass

            result_items = driver.find_elements_by_class_name('dl-result-item')
            links = [
                element.find_element_by_tag_name('a').get_property('href')
                for element in result_items
            ]
        finally:
            # Always release the browser, even when paging fails.
            driver.quit()

        for link in links:
            yield scrapy.Request(link)

    def parse(self, response):
        """Extract practice name, address and phone number from a detail page."""
        item = CareCreditItem()
        item['Practise_name'] = response.css('h1 ::text').get()
        item['address'] = response.css('.google-maps-external ::text').get()
        item['phone_no'] = response.css('.dl-detail-phone ::text').get()
        yield item
# Build a timestamped output name. The date uses dashes rather than slashes:
# a "/" in the name is a path separator, which would place the export inside
# nested day/month directories instead of producing a single CSV file.
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y")
dt = now.strftime("%H-%M-%S")
file_name = dt_string + "_" + dt + "zip-code" + pin_code + ".csv"

# Run the spider in-process and export every yielded item as CSV.
process = CrawlerProcess(settings={
    'FEED_URI': file_name,
    'FEED_FORMAT': 'csv',
})
process.crawl(CareCredit1Spider)
process.start()
print("CSV File is Ready")
items.py
import scrapy
class CareCreditItem(scrapy.Item):
    """Record holding the three values stored for one practice."""

    # Each attribute declares one item key.
    Practise_name = scrapy.Field()
    address = scrapy.Field()
    phone_no = scrapy.Field()
The Question:
Essentially my question is simple. Is there a way to optimize this code in order for it to perform faster? Or what are the other potential methods in order to handle scraping this data without it taking forever?
Solution
Since the site loads its data dynamically from an API, you can retrieve the data directly from that API. This will speed things up quite a bit, but I'd still add a delay between requests to avoid hitting the rate limit.
import requests
import time
import pandas as pd
zipcode = '00704'
radius = 75

# Single template for every page so the query string cannot drift between the
# first request and the follow-up requests.
URL_TEMPLATE = (
    'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService'
    '&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}'
    '&PracticePhone=&Profession=&location={zipcode}&Page={page}'
)


def fetch_page(page):
    """Return the decoded JSON payload for one page of locator results.

    Raises ``requests.HTTPError`` on a non-2xx response instead of failing
    later while decoding an error page as JSON.
    """
    url = URL_TEMPLATE.format(zipcode=zipcode, radius=radius, page=page)
    # A timeout keeps one stalled connection from hanging the whole run.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    return req.json()


# The first page also reports how many pages exist in total ('maxPage').
r = fetch_page(1)
data = r['results']
for i in range(2, r['maxPage'] + 1):
    r = fetch_page(i)
    data.extend(r['results'])
    # Pause between requests to stay clear of the rate limit.
    time.sleep(1)

df = pd.DataFrame(data)
# Dashes instead of slashes in the date: "/" is a path separator, so the
# original name pointed into a directory that does not exist.
df.to_csv(f'{pd.Timestamp.now().strftime("%d-%m-%Y_%H-%M-%S")}zip-code{zipcode}.csv')
Answered By - RJ Adriaansen
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.