Issue
I'm trying to create a web scraper that will return restaurant names and addresses from the website. In the current version it returns only names (as a test), but they are printed as a growing list on every pass of the loop instead of a single list at the end:

[{'name': 'Copernicus Restaurant | Copernicus Hotel'}]
[{'name': 'Copernicus Restaurant | Copernicus Hotel'}, {'name': 'Farina Restaurant'}]
[{'name': 'Copernicus Restaurant | Copernicus Hotel'}, {'name': 'Farina Restaurant'}, {'name': 'Cyrano de Bergerac'}]

Could someone help me correct this code so that it follows the link to each restaurant and then extracts the restaurant's name and address from those pages?
I will be grateful for any help.
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)

baseurl = 'https://restaurantguru.com/'
driver.get('https://restaurantguru.com/restaurant-Poland-t1')

soup = BeautifulSoup(driver.page_source, 'lxml')
productlist = soup.find_all('div', class_='wrapper_info')
#print(productlist)

productlinks = []
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(link['href'])
#print(productlinks)

restlist = []
for link in productlinks:
    r = driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    name = soup.find('h1', class_='notranslate').text.strip()
    # address = soup.find('div', class_='address')
    # try:
    #     website = soup.find('a', href=True)
    # except:
    #     website = 'NULL'
    rest = {
        'name': name,
        # 'website': website,
        # 'address': address
    }
    restlist.append(rest)
    print(restlist)

driver.quit()
My edited code, which gives a wrong result:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
import csv

#driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
#driver = webdriver.Chrome(service=driver_service)
driver = webdriver.Chrome()

baseurl = 'https://restaurantguru.com/'
driver.get('https://restaurantguru.com/restaurant-Gostynin-t1')

soup = BeautifulSoup(driver.page_source, 'lxml')
productlist = soup.find_all('div', class_='wrapper_info')
#print(productlist)

productlinks = []
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(link['href'])
#print(len(productlinks))

restlist = []
for link in productlinks:
    print('[DEBUG] link:', link)
    driver.get(link)

    print('[DEBUG] soup ...')
    soup = BeautifulSoup(driver.page_source, 'lxml')

    print('[DEBUG] name ...')
    name = soup.find('h1', class_='notranslate').text.strip()
    print(name)
    name = driver.find_element(By.XPATH, '//h1[@class="notranslate"]').text.strip()
    print(name)

    print('[DEBUG] address ...')
    address = soup.find('div', class_='address').find('div', class_=False).text.strip()
    print(address)
    address = driver.find_element(By.XPATH, '//div[@class="address"]/div[2]').text.strip()
    print(address)

    print('[DEBUG] website ...')
    try:
        website = soup.find('div', class_='website').find('a').text  #get('href')
        print(website)
        website = driver.find_element(By.XPATH, '//div[@class="website"]//a').text  #get('href')
        print(website)
    except Exception as ex:
        website = ''

    rest = {
        'name': name,
        'website': website,
        'address': address,
    }
    restlist.append(rest)

print(restlist)

#df = pd.DataFrame(restlist)
#df.to_csv('C:/webdrivers/restauracje.csv')
#print(df.head(10))

driver.quit()
Solution
There are many a tags with href on the page, so you have to use a more specific query to get the website. The website is inside <div class="website">, so you can do

website = soup.find('div', class_='website').find('a').get('href')

but the real link to the restaurant is in the tag's text, not in its href:

website = soup.find('div', class_='website').find('a').text

As for the address, I also had to add an extra .find('div', class_=False) (and .text.strip()) to get it:

address = soup.find('div', class_='address').find('div', class_=False).text.strip()
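
In BeautifulSoup, class_=False matches only tags that have no class attribute at all, which is how the plain inner div holding the address text gets selected while its styled siblings are skipped. A minimal sketch of that behavior, with made-up HTML that is only shaped like the real page, not copied from it:

from bs4 import BeautifulSoup

# hypothetical markup, roughly mimicking the structure of a restaurant page
html = '''
<div class="address">
    <div class="title">Address</div>
    <div>Kanonicza 16, Krakow, Poland</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')

# class_=False skips <div class="title"> and returns the class-less <div>
print(soup.find('div', class_='address').find('div', class_=False).text.strip())
# Kanonicza 16, Krakow, Poland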
Selenium has its own methods to search for elements in the HTML, and it may run faster. Note that find_element_by_xpath() was removed in Selenium 4, so below I use the current form driver.find_element(By.XPATH, ...), which needs from selenium.webdriver.common.by import By:

name = driver.find_element(By.XPATH, '//h1[@class="notranslate"]').text.strip()
address = driver.find_element(By.XPATH, '//div[@class="address"]/div[2]').text.strip()
website = driver.find_element(By.XPATH, '//div[@class="website"]//a').text  #get('href')

Tested with Firefox on Linux. In the code I keep both methods, soup.find and driver.find_element:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv

#from selenium.webdriver.chrome.service import Service
#driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
#driver = webdriver.Chrome(service=driver_service)

try:
    driver = webdriver.Firefox()
    driver.get('https://restaurantguru.com/restaurant-Poland-t1')

    soup = BeautifulSoup(driver.page_source, 'lxml')
    productlist = soup.find_all('div', class_='wrapper_info')
    #print(productlist)

    print('[DEBUG] productlist ...')
    productlinks = []
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(link['href'])
    print('len(productlinks):', len(productlinks))

    restlist = []
    for link in productlinks:
        print('[DEBUG] link:', link)
        driver.get(link)

        print('[DEBUG] soup ...')
        soup = BeautifulSoup(driver.page_source, 'lxml')

        print('[DEBUG] name ...')
        name = soup.find('h1', class_='notranslate').text.strip()
        print(name)
        name = driver.find_element(By.XPATH, '//h1[@class="notranslate"]').text.strip()
        print(name)

        print('[DEBUG] address ...')
        address = soup.find('div', class_='address').find('div', class_=False).text.strip()
        print(address)
        address = driver.find_element(By.XPATH, '//div[@class="address"]/div[2]').text.strip()
        print(address)

        print('[DEBUG] website ...')
        try:
            website = soup.find('div', class_='website').find('a').text  #get('href')
            print(website)
            website = driver.find_element(By.XPATH, '//div[@class="website"]//a').text  #get('href')
            print(website)
        except Exception as ex:
            print('[DEBUG] Exception:', ex)
            website = ''
            print(website)

        rest = {
            'name': name,
            'website': website,
            'address': address,
        }

        print('[DEBUG] rest ...')
        print(rest)
        print('-----')

        restlist.append(rest)

    # --- after `for`-loop ---

    print(restlist)

except KeyboardInterrupt:
    print("KeyboardInterrupt")
finally:
    driver.quit()

# open only once
with open('output.csv', 'w') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['name', 'website', 'address'])
    csv_writer.writeheader()
    csv_writer.writerows(restlist)
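
One caveat, as an assumption rather than something from the tested run: the scraped addresses contain Polish characters, so on Windows it may be safer to open the CSV with an explicit encoding, and the csv module's documentation recommends newline='' to avoid blank rows:

# hedged tweak: explicit encoding and newline handling for the same writer
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['name', 'website', 'address'])
    csv_writer.writeheader()
    csv_writer.writerows(restlist)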
Result (from print(restlist)):
[
{'name': 'Copernicus Restaurant | Copernicus Hotel', 'website': 'https://www.likusrestauracje.pl/', 'address': 'Kanonicza 16, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Farina Restaurant', 'website': 'https://www.farina.com.pl/', 'address': 'Świętego Marka 16, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Cyrano de Bergerac', 'website': 'http://cyranodebergerac.com.pl', 'address': 'Sławkowska 26, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Amarylis Restaurant', 'website': 'https://www.queenhotel.pl/', 'address': 'Józefa Dietla 60, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Projekt Nano', 'website': '', 'address': 'Podmurna 17 A, Torun, Kuyavian-Pomeranian Voivodeship, Poland'},
{'name': 'Raffles Europejski Warsaw', 'website': 'https://www.raffles.com/warsaw', 'address': 'Nowy Świat-Uniwersytet'},
{'name': 'Caffe Horst', 'website': 'http://www.caffehorst.pl/', 'address': 'Świętochłowicka 6, Bytom, Silesian Voivodeship, Poland'},
{'name': 'Proza', 'website': '', 'address': 'Jana Karola Chodkiewicza 7, Rzeszow, Podkarpackie Voivodeship, Poland'},
{'name': 'Il Posto di Luca Santarossa', 'website': 'http://www.ilposto.pl', 'address': 'Jana Sawy 5/lokal 10, Lublin, Lublin Voivodeship, Poland'},
{'name': 'Balkan Bistro Prespa', 'website': '', 'address': 'Władysława Syrokomli 8, Warsaw, Masovian Voivodeship, Poland'},
{'name': 'Mr Coffee', 'website': '', 'address': 'Tumska 4, Klodzko, Lower Silesian Voivodeship, Poland'},
{'name': 'Bottiglieria 1881 Restaurant', 'website': 'https://www.1881.com.pl/', 'address': 'Bocheńska 5, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Albertina Restaurant & Wine', 'website': 'https://www.albertinarestaurant.pl/', 'address': 'Dominikańska 3, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Pub & Restauracja „W Sercu Łodzi”', 'website': '', 'address': 'al. Marszałka Józefa Piłsudskiego 138, Łódź, Łódź Voivodeship, Poland'},
{'name': '#Alternatywnie', 'website': 'http://www.altcoffee.pl/', 'address': 'aleja Wojska Polskiego 35/u3, Szczecin, West Pomeranian Voivodeship, Poland'},
{'name': 'Aqua e Vino', 'website': 'http://www.aquaevino.pl', 'address': 'Wiślna 5/10, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Pili Pili Gdańsk', 'website': 'http://www.pilipilicafe.com/', 'address': 'Szafarnia 11/U14, Gdańsk, Pomeranian Voivodeship, Poland'},
{'name': 'Kawiarnia Coffeinna', 'website': '', 'address': '1 Maja 26, Jastrzębie-Zdrój, Silesian Voivodeship, Poland'},
{'name': 'Mleczarnia', 'website': 'http://www.mle.pl', 'address': 'Rabina, Beera Meiselsa 20, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'La Squadra Ristorante', 'website': 'http://lasquadra.pl/restauracja/', 'address': 'Bocheńskiego 109, Katowice, Silesian Voivodeship, Poland'}
]
Answered By - furas