Issue
I'm trying to scrape replies to public Tweets using Python.
I have the code below, which gets all the replies currently displayed on screen, but I am having trouble collecting the rest of the replies, which require scrolling.
The code works fine without the scroll loop, but once the loop is added, it just returns blank results.
Can someone please help me figure out why?
Tweet to be used as an example: https://twitter.com/BBCWorld/status/1535676092450840578
Code with scrolling loop:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\User\AppData\Local\SeleniumBasic\chromedriver")  # point this at the chromedriver on your PC
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")  # URL used as example
time.sleep(60)  # give the page plenty of time to load

all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

tweets = []
while True:
    for item in all_tweets[1:]:  # skip the tweet already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except:
            replying_to = '[empty]'
        print(replying_to)

        tweets.append([date, replying_to, text])

    time.sleep(3)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(3)

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\User\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a CSV file in the Downloads folder
print(df)
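A note on the likely cause: all_tweets is collected once, before any scrolling. Twitter renders the timeline as a virtualized list, so after a scroll the previously found article elements are detached from the DOM; reading .text on them then raises a StaleElementReferenceException, which the bare except clauses silently turn into '[empty]'. Below is a minimal sketch of the re-query-after-each-scroll pattern, assuming the same XPaths as above, a chromedriver on PATH, and that the thread loads without logging in; it is an illustration, not the exact fix from the original thread:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()  # assumes chromedriver is on PATH; adjust as needed
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(10)

seen = set()
while True:
    # Re-query after every scroll: element references found before a
    # scroll go stale once Twitter re-renders its virtualized timeline.
    for item in driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')[1:]:
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            continue  # element detached mid-read; it will be re-read on the next pass
        if text not in seen:
            seen.add(text)
            print(text)
    old_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(5)  # let new replies render before measuring the height again
    if driver.execute_script("return document.body.scrollHeight") == old_height:
        break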
---UPDATE---
Based on a suggestion I received, I've updated the code as follows, but I am still only getting the first batch of replies (i.e., the ones loaded after scrolling are still missing):
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\Users\AppData\Local\SeleniumBasic\chromedriver")
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(60)

tweets = []
result = False
old_height = driver.execute_script("return document.body.scrollHeight")

# set initial all_tweets to start the loop
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

while result == False:
    for item in all_tweets[1:]:  # skip the tweet already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except:
            replying_to = '[empty]'
        print(replying_to)

        # append the new replies to the tweets array
        tweets.append([date, replying_to, text])

    # scroll down the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == old_height:
        result = True
    old_height = new_height

    # update all_tweets to keep the loop going
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\Users\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a CSV file in the Downloads folder
print(df)
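The likely reason this version still stops after the first batch: new_height is measured immediately after the scroll command, before the newly loaded replies have rendered, so new_height == old_height on the very first pass and the loop exits. The usual pattern is scroll, wait, then measure, which is exactly what the solution below does with its time.sleep(6). A minimal sketch of that pattern as a helper (scroll_to_end is a hypothetical name, not part of any library):

import time

def scroll_to_end(driver, pause=6.0):
    # Scroll until the page height stops growing. `pause` is a guess at
    # how long new replies take to render; tune it to your connection.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # wait BEFORE measuring, or the height never changes
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height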
Solution
I am happy to share that I finally found a solution to the above query! It's not perfect (it doesn't load hidden replies and only scrapes the top-level replies, i.e., it doesn't consider the sub-replies), but it was enough for my current needs.
So feel free to use it, but keep these limitations in mind :)
# Do imports
import numpy as np
import pandas as pd
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up the driver
driver = webdriver.Chrome(executable_path=r"C:\Users\your_user\AppData\Local\SeleniumBasic\chromedriver")  # change to your user and folder structure
driver.get("the url you want to scrape")  # input the URL you want to scrape here
time.sleep(10)  # adjust to your PC and internet connection

tweets = []

# Get the scroll height after the first page load
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the page to load
    time.sleep(6)

    # Calculate the new scroll height and compare it with the last one
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

    # update all_tweets to keep the loop going
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

    for item in all_tweets[1:]:  # skip the tweet already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except:
            replying_to = '[empty]'
        print(replying_to)

        # Append the new replies to the tweets array
        tweets.append([replying_to, text, date])

df = pd.DataFrame(tweets, columns=['Replying to', 'Tweet', 'Date of Tweet'])
df.to_csv(r'C:\Users\your_user\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a CSV file in your Downloads folder, change to your structure and desired folder
print(df)
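One more caveat: because the loop re-reads every article still rendered after each scroll, the same reply can be appended more than once. If that matters for your use case, deduplicating the finished DataFrame with pandas' drop_duplicates should be enough; a one-line sketch:

# Drop repeated rows produced by re-scraping replies that were still rendered.
df = df.drop_duplicates().reset_index(drop=True)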
Answered By - M_B