Issue
I've recently implemented a scroll function for an infinitely scrolling page, inside a class I'm working on:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class MixCloudDiscover:
    def __init__(self):
        # Selenium 4 takes the driver path via a Service object rather than
        # as the first positional argument
        service = Service(r'C:\Users\mok_z\OneDrive\Desktop\webdrivers\chromedriver.exe')
        self.driver = webdriver.Chrome(service=service)
        self.wait = WebDriverWait(self.driver, 20)

    def discover(self, terms):
        self.open_browser()
        mixcloud_data = []
        for term in terms:
            self.search(term)
            time.sleep(2)
            self.scroll()
            time.sleep(5)
            # Parse the page source only *after* scrolling, so the soup
            # contains every card that has been lazily loaded
            html = BeautifulSoup(self.driver.page_source, 'lxml')
            cards = html.find_all('div', class_='styles__UserCardInformation-sc-f909fw-5 jEfkYy')
            for card in cards:
                user_profile_url = self.open_profile(card)
                self.driver.get(user_profile_url)
                time.sleep(0.5)
                print('Link: ' + user_profile_url)
                # Scrape the profile once and unpack, instead of re-parsing
                # the page source once per field
                name, followers, bio, location, twitter = self.profile_scrape()
                mixcloud_data.append({'Link': user_profile_url, 'Curator': name,
                                      'Followers': followers, 'Bio': bio,
                                      'Location': location, 'Twitter': twitter})
        self.driver.close()
        return mixcloud_data

    def open_browser(self):
        self.driver.get('https://www.mixcloud.com')
        time.sleep(2)
        self.driver.maximize_window()

    def search(self, term):
        time.sleep(2)
        self.driver.find_element(By.NAME, "mixcloud_query").click()
        time.sleep(0.5)
        self.driver.find_element(By.NAME, "mixcloud_query").send_keys(term)

    def get_html(self):
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def scroll(self):
        # Focus the page body so the END key presses scroll the window
        music_div = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'body[class="rebrand"]')))
        music_div.click()
        while True:
            time.sleep(0.5)
            music_div.send_keys(Keys.END)
            print('scrolled to bottom')
            songs = self.wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div[class^="SearchAudioCard__SearchAudioCardContainer"]')))
            print('songs:', len(songs))
            if len(songs) > 3000:
                print('stopping')
                break

    def open_profile(self, card):
        user_link_suffix = card.div.span.a['href']
        return f'https://www.mixcloud.com{user_link_suffix}'

    def pull_twitter(self, profile_html):
        social_links = profile_html.find_all('div', class_='social-links')
        try:
            for div in social_links:
                return div.a['href']  # first social link on the profile
        except (AttributeError, KeyError, TypeError):
            return None

    def pull_location(self, profile_html):
        location = profile_html.find('p', class_='profile-location')
        try:
            return location.text.strip()
        except AttributeError:
            return None

    def pull_bio(self, profile_html):
        bio = profile_html.find('div', class_='profile-bio')
        try:
            return bio.text.strip()
        except AttributeError:
            return None

    def pull_followers(self, profile_html):
        header = profile_html.find('div', class_='cf')
        try:
            return header.h2.a.text
        except AttributeError:
            print("Can't pull followers")
            return None

    def profile_scrape(self):
        profile_html = BeautifulSoup(self.driver.page_source, 'lxml')
        time.sleep(0.5)
        name = self.pull_name(profile_html)
        followers = self.pull_followers(profile_html)
        bio = self.pull_bio(profile_html)
        location = self.pull_location(profile_html)
        twitter = self.pull_twitter(profile_html)
        return name, followers, bio, location, twitter

    def pull_name(self, profile_html):
        name_element = profile_html.find('div', class_='profile-username')
        try:
            return name_element.h1.text
        except AttributeError:
            return None
See the scroll function in particular. When I run the class through the script below, I used to get a couple of hundred rows of data; now I get a maximum of 37.
from datetime import datetime

import pandas as pd

import MixcloudScraperDiscoverFunc as search

terms = ['house']
discover = search.MixCloudDiscover()
discover_data = discover.discover(terms)

currentDateTime = datetime.now().strftime("%m-%d-%Y %H-%M-%S %p")
mixcloud_df = pd.DataFrame(discover_data)
print(mixcloud_df)
mixcloud_df.to_csv(f"Mixcloud_Data_{terms}_{currentDateTime}.csv", index=False)
How can I ensure the code iterates through all of the 'cards' (user profiles) found on the page after it has been scrolled? Is there something I'm missing here?
Thanks in advance.
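For reference, a common way to detect the end of an infinite scroll is to compare document.body.scrollHeight before and after each scroll and stop once it stops growing. A minimal sketch of that pattern as a drop-in scroll method, reusing self.driver from the class above (the 2-second pause is an assumed value to tune):

    def scroll(self):
        # Sketch: stop when the page height stops growing
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # assumed pause; give lazy-loaded cards time to render
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break  # no new content appeared, so the real bottom was reached
            last_height = new_height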
Solution
I increased the sleep time immediately after self.scroll() to 20 seconds or more, to give the page time to finish loading. The longer I waited, the more results I got.
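Concretely, that is a one-line change in the discover() method from the question (the 20-second figure is what worked here; tune it to your connection):

    self.scroll()
    time.sleep(20)  # was 5; let the lazily loaded cards render before parsing
    html = BeautifulSoup(self.driver.page_source, 'lxml')
    cards = html.find_all('div', class_='styles__UserCardInformation-sc-f909fw-5 jEfkYy')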
Answered By - Duc Nguyen