Issue
I'm trying to scrape replies to public Tweets using Python.
I have the code below, which gets all the replies currently displayed on screen, but I am having trouble collecting the rest of the replies, which require scrolling.
The code works fine without the scroll loop, but once the loop is added, it just returns blank results.
Can someone please help me figure out why?
Tweet to be used as an example: https://twitter.com/BBCWorld/status/1535676092450840578
Code with scrolling loop:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\User\AppData\Local\SeleniumBasic\chromedriver")  # point this at the chromedriver on your PC
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")  # URL used as example
time.sleep(60)  # give the page plenty of time to load

all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

tweets = []
while True:
    for item in all_tweets[1:]:  # skip the tweet already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except:
            replying_to = '[empty]'
        print(replying_to)

        tweets.append([date, replying_to, text])

    time.sleep(3)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(3)

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\User\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a CSV file in the Downloads folder
print(df)
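A note on the likely cause: all_tweets is collected once, before any scrolling. Twitter renders the timeline as a virtualized list, so after a scroll the previously found article elements are detached from the DOM; reading .text on them then raises a StaleElementReferenceException, which the bare except clauses silently turn into '[empty]'. Below is a minimal sketch of the re-query-after-each-scroll pattern, assuming the same XPaths as above, a chromedriver on PATH, and that the thread loads without logging in; it is an illustration, not the exact fix from the original thread:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()  # assumes chromedriver is on PATH; adjust as needed
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(10)

seen = set()
while True:
    # Re-query after every scroll: element references found before a
    # scroll go stale once Twitter re-renders its virtualized timeline.
    for item in driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]')[1:]:
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except Exception:
            continue  # element detached mid-read; it will be re-read on the next pass
        if text not in seen:
            seen.add(text)
            print(text)
    old_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(5)  # let new replies render before measuring the height again
    if driver.execute_script("return document.body.scrollHeight") == old_height:
        break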
---UPDATE---
Based on a suggestion I received, I've updated the code as follows, but I am still only getting the first batch of replies (i.e., the ones loaded after scrolling are still missing):
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd
import time

driver = webdriver.Chrome(executable_path=r"C:\Users\AppData\Local\SeleniumBasic\chromedriver")
driver.get("https://twitter.com/BBCWorld/status/1535676092450840578")
time.sleep(60)

tweets = []
result = False
old_height = driver.execute_script("return document.body.scrollHeight")

# set initial all_tweets to start the loop
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

while result == False:
    for item in all_tweets[1:]:  # skip the tweet already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except:
            replying_to = '[empty]'
        print(replying_to)

        # append the new replies to the tweets array
        tweets.append([date, replying_to, text])

    # scroll down the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == old_height:
        result = True
    old_height = new_height

    # update all_tweets to keep the loop going
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv(r'C:\Users\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a CSV file in the Downloads folder
print(df)
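The likely reason this version still stops after the first batch: new_height is measured immediately after the scroll command, before the newly loaded replies have rendered, so new_height == old_height on the very first pass and the loop exits. The usual pattern is scroll, wait, then measure, which is exactly what the solution below does with its time.sleep(6). A minimal sketch of that pattern as a helper (scroll_to_end is a hypothetical name, not part of any library):

import time

def scroll_to_end(driver, pause=6.0):
    # Scroll until the page height stops growing. `pause` is a guess at
    # how long new replies take to render; tune it to your connection.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # wait BEFORE measuring, or the height never changes
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height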
Solution
I am happy to share that I finally found a solution to the above query! It's not perfect (it doesn't load hidden replies and only scrapes the top-level replies, i.e., it doesn't consider the sub-replies), but it was enough for my current needs.
So feel free to use it, but keep these limitations in mind :)
# Do imports
import numpy as np
import pandas as pd
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up the driver
driver = webdriver.Chrome(executable_path=r"C:\Users\your_user\AppData\Local\SeleniumBasic\chromedriver")  # change to your user and folder structure
driver.get("the url you want to scrape")  # input the URL you want to scrape here
time.sleep(10)  # adjust to your PC and internet connection

tweets = []

# Get the scroll height after the first page load
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the page to load
    time.sleep(6)

    # Calculate the new scroll height and compare it with the last one
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

    # update all_tweets to keep the loop going
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

    for item in all_tweets[1:]:  # skip the tweet already scraped
        print('--- date ---')
        try:
            date = item.find_element(By.XPATH, './/time').text
        except:
            date = '[empty]'
        print(date)

        print('--- text ---')
        try:
            text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
        except:
            text = '[empty]'
        print(text)

        print('--- replying_to ---')
        try:
            replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
        except:
            replying_to = '[empty]'
        print(replying_to)

        # Append the new replies to the tweets array
        tweets.append([replying_to, text, date])

df = pd.DataFrame(tweets, columns=['Replying to', 'Tweet', 'Date of Tweet'])
df.to_csv(r'C:\Users\your_user\Downloads\Tweets.csv', index=False, encoding='utf-8')  # save a CSV file in your Downloads folder, change to your structure and desired folder
print(df)
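One more caveat: because the loop re-reads every article still rendered after each scroll, the same reply can be appended more than once. If that matters for your use case, deduplicating the finished DataFrame with pandas' drop_duplicates should be enough; a one-line sketch:

# Drop repeated rows produced by re-scraping replies that were still rendered.
df = df.drop_duplicates().reset_index(drop=True)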
Answered By - M_B