Issue
I am trying to run some code, but I am getting the error AttributeError: 'NoneType' object has no attribute 'find_all' in the part that loops over the pages. I think the site's JavaScript is detecting that I'm using ChromeDriver and blocking the request to the web page.
Any suggestions for what to do?
cards = []
pages = 5

for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i + 1)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class': 'section_OLXad-list'}).find_all('li', class_="item")
    for anuncio in anuncios:
        card = {}
        card['value'] = get_text(anuncio.find('p', {'class': "OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))
        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_11539/2840841130.py in <module>
      7 
      8     soup = BeautifulSoup(driver.page_source, 'html.parser')
----> 9     anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")
     10 
     11     for anuncio in anuncios:

AttributeError: 'NoneType' object has no attribute 'find_all'
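For context: the traceback means soup.find('div', {'class': 'section_OLXad-list'}) returned None, so the chained .find_all call has nothing to operate on. That happens whenever the page served to headless Chrome doesn't contain that div, whether because of bot detection or simply different markup. A minimal guard, reusing the names from the loop above, would let the loop skip such pages instead of crashing:

# minimal sketch, meant to replace the 'anuncios = ...' line inside the loop;
# reuses i and soup from the loop above and skips pages missing the container
container = soup.find('div', {'class': 'section_OLXad-list'})
if container is None:
    print(f'Page {i + 1}: listing container not found, skipping')
    continue
anuncios = container.find_all('li', class_="item")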
COMPLETE CODE
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

def get_text(bs_tag):
    if bs_tag:
        return bs_tag.get_text().strip().replace('\n', '').replace('\t', '')
    else:
        return ''

def get_link(bs_tag):
    if bs_tag:
        return bs_tag['href']
    else:
        return ''

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(chromedriver, options=options)
driver.implicitly_wait(5)

cards = []
pages = 5

for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i + 1)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class': 'section_OLXad-list'}).find_all('li', class_="item")
    for anuncio in anuncios:
        card = {}
        card['value'] = get_text(anuncio.find('p', {'class': "OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))
        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)
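As an aside on the bot-detection hypothesis: headless Chrome exposes a navigator.webdriver flag that sites can check. A commonly suggested (though not guaranteed) mitigation is to reduce these automation fingerprints via ChromeOptions before creating the driver, as in this sketch:

# hedged sketch: these options reduce obvious automation fingerprints,
# but determined sites can still detect headless Chrome
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)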
Solution
You don't have to use Selenium for this; all the data is stored in a script tag, which can easily be scraped like this:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

final = []
for page in range(1, 6):  # pages 1-5, matching the original loop
    url = f'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o={page}'
    landing_page = requests.get(url, headers=headers)
    print(f'Scraping page {page}')
    soup = BeautifulSoup(landing_page.text, 'html.parser')
    # the listing data is embedded as HTML-escaped JSON in the data-json attribute
    dirty = soup.find('script', {'id': 'initial-data'})['data-json']
    clean = json.loads(dirty.replace('&quot;', '"'))
    data = clean['listingProps']['adList']
    for listing in data:
        # drop bulky nested fields to clean up the csv;
        # some listings don't have images/properties, so pop with a default
        listing.pop('images', None)
        listing.pop('properties', None)
        final.append(listing)

df = pd.DataFrame(final)
df.to_csv('output.csv', index=False)
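As a follow-up, if you only want the fields from the question (value, location, link) rather than every column, you can pick them out of each adList entry. The key names below (price, location, url) are assumptions about the adList schema; inspect df.columns on a real response to confirm them:

# minimal sketch; 'price', 'location' and 'url' are assumed key names,
# verify them against df.columns before relying on this
rows = []
for listing in final:
    rows.append({
        'value': listing.get('price', ''),
        'location': listing.get('location', ''),
        'link': listing.get('url', ''),
    })
subset = pd.DataFrame(rows)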
Answered By - bushcat69