Issue
Scraping Google Maps reviews. How can I break out of the while
loop when it reaches the end of the reviews? Where am I making the mistake?
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Question's script: for every URL in urls.txt, open the Google Maps page,
# keep scrolling the reviews pane and append each review to a per-place CSV.
# NOTE(review): the indentation of this script was lost in extraction; the
# nesting below is reconstructed from the statement order and from the
# answer's description of the problem - confirm against the original post.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
header_added = False  # flips to True once a CSV header row has been written
with open('urls.txt') as f:
    for url in f:
        driver.get(url) #line
        driver.maximize_window()
        # Page Title
        title = driver.title
        ftitle = title.split("-")
        title = ftitle[0]  # keep only the text before the first "-"
        old_reviews = set()  # review elements seen on the previous pass
        time.sleep(3)
        last_count = 0  # number of review elements found on the previous pass
        while True:
            # Jump to the bottom of the scrollable pane, then collect every
            # element currently matching the review XPath.
            scroll_div = WebDriverWait(driver, 8).until(EC.presence_of_element_located(
                (By.XPATH, '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]')))
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div) # Scroll
            reviews = WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="section-layout"]//div[@aria-label]')))
            ans = set(reviews) - set(old_reviews) # Remove duplicate reviews
            new_count = len(reviews)
            # Reviews Div
            for div in ans:
                driver.execute_script("arguments[0].scrollIntoView();", div)
                name = div.get_attribute('aria-label')
                photo_url = div.find_element_by_tag_name('a').get_attribute('href')
                rating = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-H1e3jb"]').get_attribute(
                    'aria-label')
                # Images are optional: any failure here leaves image_links as ''.
                try:
                    image_links = []
                    image_div = div.find_element_by_xpath('.//div[@class="ODSEW-ShBeI-Jz7rA"]')
                    images = image_div.find_elements_by_xpath('button')
                    for img in images:
                        im_link = img.value_of_css_property('background-image')
                        # Drop 5 leading and 2 trailing characters - presumably
                        # the CSS url("...") wrapper; verify against real values.
                        im_link = im_link[5:]
                        im_link = im_link[:-2]
                        image_links.append(im_link)
                except:
                    image_links = ''
                    pass
                # Best-effort click on a button inside the review (presumably the
                # one that expands a truncated comment); failures are ignored.
                try:
                    div.find_element_by_xpath('.//jsl//button').click()
                    time.sleep(1)
                except:
                    pass
                comment = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-text"]').text
                dict1 = {'Title': title, "Name": name, "Profile": photo_url, "Rating": rating,
                         "Comment": comment, "Images Posted": image_links}
                # Append one row per review; note this rebinds the name `f`
                # that the enclosing `with open('urls.txt')` also uses.
                with open(f'Google_reviews_for_{title}.csv', 'a+', encoding='utf-8-sig', newline='') as f:
                    w = csv.DictWriter(f, dict1.keys())
                    if not header_added:
                        w.writeheader()
                        header_added = True
                    w.writerow(dict1)
                # NOTE(review): these statements sit inside `for div in ans`, so
                # `break` only leaves that for loop - the enclosing `while True`
                # is never exited. This is the behaviour the question asks about.
                old_reviews = reviews
                if last_count == new_count:
                    break
                last_count = new_count
Solution
Instead of `break`, try using `return`.
Since your `break` is inside a `for` loop, it only takes you out of that `for` loop; you are still inside the endless `while True:` loop.
To use `return`, your code must be inside a function (method).
UPD
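You don't update `new_count` inside the inner `for` loop, so the `last_count == new_count` check (and its `break`) must run at the `while` level, after the `for` loop has finished!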
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
# Answer's script: for every URL in urls.txt, open the Google Maps page, keep
# scrolling the reviews pane until no further reviews appear, and append each
# review to a per-place CSV file (Google_reviews_for_<title>.csv).
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
try:
    with open('urls.txt') as f:
        for url in f:
            url = url.strip()
            if not url:
                continue  # ignore blank lines in urls.txt
            driver.get(url)
            driver.maximize_window()
            # Page title: keep only the text before the first "-".
            title = driver.title.split("-")[0]
            # Each place is written to its own CSV, so header tracking restarts
            # for every URL.
            header_added = False
            old_reviews = set()  # review elements seen on the previous pass
            time.sleep(3)
            last_count = 0  # number of review elements found on the previous pass
            while True:
                # Jump to the bottom of the scrollable pane, then collect every
                # element currently matching the review XPath.
                scroll_div = WebDriverWait(driver, 8).until(EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]')))
                driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)  # Scroll
                reviews = WebDriverWait(driver, 5).until(
                    EC.presence_of_all_elements_located((By.XPATH, '//div[@class="section-layout"]//div[@aria-label]')))
                ans = set(reviews) - set(old_reviews)  # only reviews not handled yet
                new_count = len(reviews)
                # Reviews Div
                for div in ans:
                    driver.execute_script("arguments[0].scrollIntoView();", div)
                    name = div.get_attribute('aria-label')
                    photo_url = div.find_element_by_tag_name('a').get_attribute('href')
                    rating = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-H1e3jb"]').get_attribute(
                        'aria-label')
                    # Images are optional: any lookup failure leaves image_links as ''.
                    try:
                        image_links = []
                        image_div = div.find_element_by_xpath('.//div[@class="ODSEW-ShBeI-Jz7rA"]')
                        images = image_div.find_elements_by_xpath('button')
                        for img in images:
                            im_link = img.value_of_css_property('background-image')
                            # Drop 5 leading and 2 trailing characters - presumably
                            # the CSS url("...") wrapper; verify against real values.
                            im_link = im_link[5:-2]
                            image_links.append(im_link)
                    except Exception:
                        image_links = ''
                    # Best-effort click on a button inside the review (presumably
                    # the one that expands a truncated comment); a missing or
                    # unclickable button is not an error.
                    try:
                        div.find_element_by_xpath('.//jsl//button').click()
                        time.sleep(1)
                    except Exception:
                        pass
                    comment = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-text"]').text
                    dict1 = {'Title': title, "Name": name, "Profile": photo_url, "Rating": rating,
                             "Comment": comment, "Images Posted": image_links}
                    # Distinct name so the urls.txt handle `f` is not shadowed.
                    with open(f'Google_reviews_for_{title}.csv', 'a+', encoding='utf-8-sig', newline='') as csv_file:
                        w = csv.DictWriter(csv_file, dict1.keys())
                        if not header_added:
                            w.writeheader()
                            header_added = True
                        w.writerow(dict1)
                # The end-of-reviews check runs once per scroll pass, at the
                # `while` level: `break` exits the nearest enclosing loop, so it
                # must not sit inside `for div in ans`.
                old_reviews = reviews
                if last_count == new_count:
                    break  # scrolling produced no additional reviews: done with this URL
                last_count = new_count
finally:
    # Always close the browser, even if a wait times out or a lookup fails.
    driver.quit()
Answered By - Prophet
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.