Issue
I am trying to get some information from the website Truecar but with my code, each time it wants to get the information from next page, it repeats scraping page 1.How can I scrape multiple pages without the first page being repeated with beautifulsoup? thanks
carname=input('Please enter the name of car: ')
import requests
from bs4 import BeautifulSoup
import re
import mysql.connector
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
cnx = mysql.connector.connect(user='root',host='127.0.0.1',database='info',password='nooshika')
cursor=cnx.cursor()
listname=[]
listprice=[]
listtmile=[]
listyearmodel=[]
list2=[]
final_list=[]
#response=requests.get('https://www.truecar.com/used-cars-for-sale/listings/?page=1')
#soup=BeautifulSoup(response.text,'html.parser')
def scrape(response,soup):
for link in soup.find_all('span',class_="truncate"):
link=link.get_text()
link=re.sub(r'Excellent Price',' ',str(link))
listname.append(link)
for j in listname:
if j==' ':
listname.remove(j)
for i in soup.find_all('div',class_="truncate text-xs",attrs={'data-test':"vehicleMileage"}):
listtmile.append(i.get_text())
for k in soup.find_all('span',attrs={'data-test':"vehicleListingPriceAmount"}):
k=k.get_text()
listprice.append(k)
for l in soup.find_all('span',class_="vehicle-card-year text-xs"):
l=l.get_text()
listyearmodel.append(l)
for k,i2 in enumerate(listname):
for k1,i3 in enumerate(listprice):
for k2,i4 in enumerate(listtmile):
for k3,i5 in enumerate(listyearmodel):
if k==k1==k2==k3:
list2.append((i5,i2,i3,i4))
pages = list(range(1,3))
for count in pages:
response=requests.get('https://www.truecar.com/used-cars-for-sale/listings/?page={}'.format(count))
soup=BeautifulSoup(response.text,'html.parser')
scrape(response,soup)
def contains_carname(a):
return isinstance(a, str) and carname in a
matches = [x for x in list2 if contains_carname(x[1])]
while matches==[]:
matches = [x for x in list2 if contains_carname(x[1])]
if matches!=[]:
break
print('No match.Please enter again')
carname=input('Please enter the name of car: ')
print(matches)
Solution
In your current implementation, you're appending data to lists outside the loop that scrapes each page. This leads to the accumulation of data from previous pages every time a new page is scraped. To fix this issue, you should clear your lists before scraping each new page.
Try using this:
# ... (your import statements and database connection)
carname = input('Please enter the name of the car: ')
# ... (your list initialization and functions)
pages = list(range(1, 3))
for count in pages:
response = requests.get('https://www.truecar.com/used-cars-for-sale/listings/?page={}'.format(count))
soup = BeautifulSoup(response.text, 'html.parser')
# Clear lists before scraping each page
listname.clear()
listprice.clear()
listtmile.clear()
listyearmodel.clear()
scrape(response, soup)
# ... (your matching and printing code)
By calling the clear() method on your lists before scraping each page, you ensure that you're starting with empty lists for each page and prevent accumulation of data from previous pages.
Answered By - Ketan
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.