Issue
I can't seem to generate output for anything beyond page 1. One page has 15 restaurants, and that's all I am getting (just 15 rows of output). It looks like the output from page one gets replaced by page two, and so forth.
I have tried adding in the page range to scrape but still came back with only 15 results (scraping only just one page).
import requests
import pandas
from bs4 import BeautifulSoup

# Present a regular-browser User-Agent; Zomato rejects the default requests UA.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# BUG FIX: the accumulator must be created ONCE, before the page loop.
# In the original it was re-created inside the loop, so every page wiped
# out the rows collected from the previous page (hence only 15 results).
list_rest = []

for num in range(1, 5):
    url = 'https://www.zomato.com/auckland/restaurants?gold_partner=1&page={}'.format(num)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    # The single container div that wraps this page's search results.
    top_rest = soup.find_all("div", attrs={"class": "col-s-16 search_results mbot"})
    # One div per restaurant card on the page.
    list_tr = top_rest[0].find_all("div", attrs={"class": "js-search-result-li even status 1"})
    for tr in list_tr:
        dataframe = {}
        # Newlines inside the anchor text would break the CSV layout.
        dataframe["1.rest_name"] = (tr.find("a", attrs={"class": "result-title hover_feedback zred bold ln24 fontsize0"})).text.replace('\n', ' ')
        dataframe["2.rest_address"] = (tr.find("div", attrs={"class": "col-m-16 search-result-address grey-text nowrap ln22"})).text.replace('\n', ' ')
        list_rest.append(dataframe)

# BUG FIX: write the CSV once, after ALL pages have been scraped —
# writing inside the loop overwrote the file on every iteration.
# utf-8-sig adds a BOM so spreadsheet apps decode accented names correctly.
df = pandas.DataFrame(list_rest)
df.to_csv("zomato_res26.csv", index=False, encoding='utf-8-sig')
I expect to get a list of 40+ restaurants with their names and locations, but so far I'm only getting the 15 restaurants from a single page.
Solution
Change the indentation: move the list creation, `list_rest = []`, out of the loop and append to it inside the loop. Also, change the output encoding to `encoding='utf-8-sig'` to properly handle the characters present. You can get the total number of pages with `int(soup.select_one('.pagination-number b:last-child').text)`.
I've also added `requests.Session()` to re-use the underlying TCP connection across page requests.
import requests
import pandas
from bs4 import BeautifulSoup

# Browser-like User-Agent so Zomato serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# Single accumulator shared by every page; the CSV is written once at the end.
list_rest = []

# A Session re-uses the TCP connection across the four page requests.
with requests.Session() as s:
    for num in range(1, 5):
        url = 'https://www.zomato.com/auckland/restaurants?gold_partner=1&page={}'.format(num)
        soup = BeautifulSoup(s.get(url, headers=headers).content, "html.parser")
        # The results container for this page; each card inside is one restaurant.
        container = soup.find_all("div", attrs={"class": "col-s-16 search_results mbot"})[0]
        for card in container.find_all("div", attrs={"class": "js-search-result-li even status 1"}):
            name_tag = card.find("a", attrs={"class": "result-title hover_feedback zred bold ln24 fontsize0"})
            addr_tag = card.find("div", attrs={"class": "col-m-16 search-result-address grey-text nowrap ln22"})
            # Strip embedded newlines so each value stays on one CSV row.
            list_rest.append({
                "1.rest_name": name_tag.text.replace('\n', ' '),
                "2.rest_address": addr_tag.text.replace('\n', ' '),
            })

df = pandas.DataFrame(list_rest)
# utf-8-sig writes a BOM so Excel decodes non-ASCII restaurant names correctly.
df.to_csv(r"zomato_res26.csv", sep=',', encoding='utf-8-sig', index=False)
If you want to loop all pages and use faster selectors with list comprehensions:
import requests
import pandas
from bs4 import BeautifulSoup

# Browser-like User-Agent so Zomato serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

# One sub-list of rows per scraped page; flattened just before the DataFrame.
list_rest = []

def getInfo(soup):
    """Return (name, address) tuples for every restaurant on the parsed page."""
    names = [tag.text.strip() for tag in soup.select('.result-title')]
    addresses = [tag.text.strip() for tag in soup.select('.search-result-address')]
    return list(zip(names, addresses))

with requests.Session() as s:
    url = 'https://www.zomato.com/auckland/restaurants?gold_partner=1&page={}'
    # Fetch page 1 first: it both supplies rows and tells us the page count.
    soup = BeautifulSoup(s.get(url.format(1), headers=headers).content, "lxml")
    # The last <b> in the pagination widget holds the total number of pages.
    numPages = int(soup.select_one('.pagination-number b:last-child').text)
    list_rest.append(getInfo(soup))
    # Empty range when numPages == 1, so no extra guard is needed.
    for page in range(2, numPages + 1):
        soup = BeautifulSoup(s.get(url.format(page), headers=headers).content, "lxml")
        list_rest.append(getInfo(soup))

# Flatten the per-page lists into a single list of (name, address) rows.
final_list = [row for page_rows in list_rest for row in page_rows]
df = pandas.DataFrame(final_list, columns=['1.rest_name', '2.rest_address'])
# utf-8-sig writes a BOM so Excel decodes non-ASCII restaurant names correctly.
df.to_csv(r"zomato_res26.csv", sep=',', encoding='utf-8-sig', index=False)
Answered By - QHarr
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.