Friday, October 22, 2021

[FIXED] Breakout of while loop while scrolling inside a div

October 22, 2021 loops, python, selenium, web-scraping No comments

Issue

I am scraping Google Maps reviews. How can I break out of the `while` loop once it reaches the end of the reviews? Where am I making the mistake?

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv


# Asker's original script: for every URL in urls.txt, scroll the Google Maps
# reviews panel, read each review and append it to a CSV file named after the
# page title. The loop-exit logic at the bottom is the part being asked about.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
# Set to True after the first header row is written; never reset afterwards.
header_added = False
with open('urls.txt') as f:
    for url in f:
        driver.get(url) #line
        driver.maximize_window()
        # Page Title
        # Only the text before the first "-" of the page title is kept.
        title = driver.title
        ftitle = title.split("-")
        title = ftitle[0]
        # Review elements returned by the previous pass of the while loop.
        old_reviews = set()
        time.sleep(3)
        # Number of review elements found on the previous pass.
        last_count = 0
        while True:
            # Wait up to 8 seconds for the scrollable reviews container.
            scroll_div = WebDriverWait(driver, 8).until(EC.presence_of_element_located(
                (By.XPATH, '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]')))
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)  # Scroll
            # Wait up to 5 seconds for the elements matched as reviews.
            reviews = WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="section-layout"]//div[@aria-label]')))
            ans = set(reviews) - set(old_reviews)  # Remove duplicate reviews
            new_count = len(reviews)
            # Reviews Div
            # `ans` is a set, so the reviews are visited in no particular order.
            for div in ans:
                driver.execute_script("arguments[0].scrollIntoView();", div)
                # The aria-label of the review div is stored as the "Name" column.
                name = div.get_attribute('aria-label')

                # href of the first link inside the review div ("Profile" column).
                photo_url = div.find_element_by_tag_name('a').get_attribute('href')

                rating = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-H1e3jb"]').get_attribute(
                    'aria-label')

                # Collect the background-image of every button in the image
                # container; any failure leaves image_links as ''.
                try:
                    image_links = []
                    image_div = div.find_element_by_xpath('.//div[@class="ODSEW-ShBeI-Jz7rA"]')
                    images = image_div.find_elements_by_xpath('button')
                    for img in images:
                        im_link = img.value_of_css_property('background-image')
                        # Drop the first 5 and last 2 characters of the CSS value
                        # (presumably the url("...") wrapper).
                        im_link = im_link[5:]
                        im_link = im_link[:-2]
                        image_links.append(im_link)

                except:
                    image_links = ''
                    pass
                # Best-effort click on a button inside the review before the
                # text is read (presumably the "More" expander); failures are ignored.
                try:
                    div.find_element_by_xpath('.//jsl//button').click()
                    time.sleep(1)
                except:
                    pass
                comment = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-text"]').text

                dict1 = {'Title': title, "Name": name, "Profile": photo_url, "Rating": rating,
                         "Comment": comment, "Images Posted": image_links}
                # NOTE: this rebinds `f`, the name of the urls.txt handle opened above.
                with open(f'Google_reviews_for_{title}.csv', 'a+', encoding='utf-8-sig', newline='') as f:
                    w = csv.DictWriter(f, dict1.keys())
                    if not header_added:
                        w.writeheader()
                        header_added = True
                    w.writerow(dict1)
                # NOTE: the four lines below are indented inside the `for div in ans`
                # loop, so `break` leaves only that inner loop and the enclosing
                # `while True` keeps running.
                old_reviews = reviews
                if last_count == new_count:
                    break
                last_count = new_count

The URL:- https://www.google.com/maps/place/El+TabanKo/@42.848117,-2.6741402,19z/data=!4m15!1m7!3m6!1s0xd4fc26be313bc85:0xb10d327c782f87fa!2sCorrer%C3%ADa+Kalea,+45,+01001+Gasteiz,+Araba!3b1!8m2!3d42.8480012!4d-2.6737255!3m6!1s0xd4fc26be26c5be1:0x5f5e0ee05fe08041!8m2!3d42.8481171!4d-2.6735931!9m1!1b1

Solution

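Instead of `break`, try using `return`.
Since your `break` is inside the `for` loop, it only takes you out of that `for` loop; you are still inside the endless `while True:` loop.
To use `return`, your code must be inside a function.
UPD
The `new_count` check must not be done inside the inner `for` loop! Move `old_reviews = reviews`, the `last_count == new_count` comparison and the `last_count = new_count` update one level out, into the body of the `while` loop, so that `break` exits the `while` loop itself: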

import csv
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
    
# Scrape the Google Maps reviews of every place URL listed in urls.txt and
# append them to one CSV file per place (Google_reviews_for_<title>.csv).
#
# NOTE: `executable_path` is deprecated in Selenium 4 (a `Service` object is
# the replacement); it is kept here so the script still runs on Selenium 3.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')

# Locators copied from the page markup; the obfuscated class names will need
# updating whenever the page layout changes.
SCROLL_BOX_XPATH = '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]'
REVIEW_XPATH = '//div[@class="section-layout"]//div[@aria-label]'
CSV_FIELDS = ['Title', 'Name', 'Profile', 'Rating', 'Comment', 'Images Posted']

try:
    with open('urls.txt') as url_file:
        for line in url_file:
            url = line.strip()  # lines read from the file end with a newline
            if not url:
                continue  # tolerate blank lines in urls.txt
            driver.get(url)
            driver.maximize_window()
            # Only the text before the first "-" of the page title is used.
            title = driver.title.split("-")[0]
            time.sleep(3)
            seen_reviews = set()  # every review element already written out
            last_count = 0
            while True:
                try:
                    scroll_div = WebDriverWait(driver, 8).until(
                        EC.presence_of_element_located((By.XPATH, SCROLL_BOX_XPATH)))
                    # Jump to the bottom of the reviews panel.
                    driver.execute_script(
                        'arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)
                    reviews = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.XPATH, REVIEW_XPATH)))
                except TimeoutException:
                    # Without this, one page lacking a reviews panel would
                    # abort the scrape of every remaining URL.
                    print(f'Timed out waiting for reviews on {url}; moving on.')
                    break
                new_count = len(reviews)
                # A list (not a set difference) keeps the rows in page order.
                new_reviews = [div for div in reviews if div not in seen_reviews]
                for div in new_reviews:
                    driver.execute_script("arguments[0].scrollIntoView();", div)
                    name = div.get_attribute('aria-label')
                    photo_url = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    rating = div.find_element(
                        By.XPATH, './/span[@class="ODSEW-ShBeI-H1e3jb"]').get_attribute('aria-label')

                    # Images are optional: a review without the image container
                    # is recorded with an empty "Images Posted" value.
                    try:
                        image_div = div.find_element(By.XPATH, './/div[@class="ODSEW-ShBeI-Jz7rA"]')
                        image_links = []
                        for img in image_div.find_elements(By.XPATH, 'button'):
                            css_value = img.value_of_css_property('background-image')
                            # Drop the first 5 and last 2 characters of the CSS
                            # value (presumably the url("...") wrapper).
                            image_links.append(css_value[5:-2])
                    except WebDriverException:
                        image_links = ''

                    # Best-effort click on a button inside the review before
                    # reading its text (presumably the "More" expander); a
                    # missing or unclickable button is not an error.
                    try:
                        div.find_element(By.XPATH, './/jsl//button').click()
                        time.sleep(1)
                    except WebDriverException:
                        pass
                    comment = div.find_element(By.XPATH, './/span[@class="ODSEW-ShBeI-text"]').text

                    row = {'Title': title, 'Name': name, 'Profile': photo_url, 'Rating': rating,
                           'Comment': comment, 'Images Posted': image_links}
                    with open(f'Google_reviews_for_{title}.csv', 'a+',
                              encoding='utf-8-sig', newline='') as csv_file:
                        writer = csv.DictWriter(csv_file, CSV_FIELDS)
                        # Append mode starts at the end of the file, so position 0
                        # means the file is empty and still needs its header. This
                        # gives every place's CSV a header, not just the first one.
                        if csv_file.tell() == 0:
                            writer.writeheader()
                        writer.writerow(row)
                seen_reviews.update(reviews)
                # This check sits at `while` level so that `break` ends the
                # scrolling loop: no new elements after a scroll means the end
                # of the reviews has been reached.
                if new_count == last_count:
                    break
                last_count = new_count
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

Answered By - Prophet

This answer was collected from Stack Overflow and tested by the PythonFixing community admins; it is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Friday, October 22, 2021

[FIXED] Breakout of while loop while scrolling inside a div

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels