Issue
I want to scrape the "search: pc" part of the website called Jumia. I wanted to iterate over all the pages, but unfortunately it didn't work; I don't know why it overwrites the file even though the writer is outside the loop. I tried using:
with pd.ExcelWriter("output.xlsx", engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
pop.to_excel(writer, sheet_name="sheet1"
instead of:
with open(f"output.xlsx" ,"a") :
with pd.ExcelWriter("output.xlsx") as writer:
pop.to_excel(writer,sheet_name="sheet2")
but it results in an error:
File "c:\Users\hp\Desktop\python_projects\test3.py", line 40, in <module>
find_computers()
File "c:\Users\hp\Desktop\python_projects\test3.py", line 33, in find_computers
with pd.ExcelWriter("output.xlsx", engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\excel\_openpyxl.py", line 61, in __init__
super().__init__(
File "C:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\excel\_base.py", line 1263, in __init__
self._handles = get_handle(
^^^^^^^^^^^
File "C:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\io\common.py", line 872, in get_handle
handle = open(handle, ioargs.mode)
^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'output.xlsx'
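The traceback shows why: with the openpyxl engine, mode="a" opens an existing workbook for appending, so the very first run fails before output.xlsx has ever been created. A minimal guard, sketched here as a hypothetical save_frame helper (not part of the original script), falls back to the default write mode on the first run:

from pathlib import Path
import pandas as pd

def save_frame(pop: pd.DataFrame, path: str = "output.xlsx") -> None:
    # Hypothetical helper: append mode only works once the workbook exists.
    if Path(path).exists():
        with pd.ExcelWriter(path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
            pop.to_excel(writer, sheet_name="sheet1")
    else:
        # First run: create the workbook with the default write mode.
        with pd.ExcelWriter(path, engine="openpyxl") as writer:
            pop.to_excel(writer, sheet_name="sheet1")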
This is my actual code:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import openpyxl
from bs4 import Tag
def find_computers():
    n=1
    while n<=50:
        html_text=requests.get(f"https://www.jumia.ma/catalog/?q=pc&page={n}#catalog-listing").text
        soup=BeautifulSoup(html_text,"lxml")
        computers=soup.find_all("a",class_="core")
        df={"price": [],"original price": [],"promo":[]}
        computer_name_list=[]
        for computer in computers:
            computer_name=computer.find("h3",class_="name").text.strip()
            price=computer.find("div",class_="prc").text.strip()
            original_price_element=computer.find("div",class_="old")
            original_price=original_price_element.text.strip() if isinstance(original_price_element, Tag) else "N/A"
            promo_element = computer.find("div", class_="bdg _dsct _sm")
            promo = promo_element.text.strip() if isinstance(promo_element, Tag) else "N/A"
            df["price"].append(price)
            df["original price"].append(original_price)
            df["promo"].append(promo)
            computer_name_list.append(computer_name)
        n+=1
    pop=pd.DataFrame(df,index=computer_name_list)
    pd.set_option('colheader_justify', 'center')
    with pd.ExcelWriter("output.xlsx") as writer:
        pop.to_excel(writer,sheet_name="sheet2")

if __name__=="__main__":
    while True:
        find_computers()
        time_s = 10
        time.sleep(6 * time_s)
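Note that pd.ExcelWriter defaults to mode="w", so every call to find_computers() from the outer while True loop recreates output.xlsx from scratch; having the writer outside the inner while loop does not change that. One hedged workaround (assuming output.xlsx already exists from a first write-mode run) would be to give each polling cycle its own sheet inside find_computers():

import time
import pandas as pd

# Sketch: replace the final "with pd.ExcelWriter(...)" block with something like:
with pd.ExcelWriter("output.xlsx", engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
    # One sheet per run, e.g. "run_20240101_120000" (fits Excel's 31-char sheet-name limit).
    pop.to_excel(writer, sheet_name=time.strftime("run_%Y%m%d_%H%M%S"))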
Solution
It also works fine for me, so just in addition: to avoid creating an empty dataframe and appending to it inside your loop, here is an example of how to store the results and create the dataframe after collection:
import pandas as pd
from bs4 import BeautifulSoup
import requests, time

def find_computers():
    n=1
    data = []
    while n<=5:
        html_text=requests.get(f"https://www.jumia.ma/catalog/?q=pc&page={n}#catalog-listing").text
        soup=BeautifulSoup(html_text,"lxml")
        computers=soup.find_all("a",class_="core")
        for computer in computers:
            data.append({
                'name':computer.find("h3",class_="name").text.strip(),
                'price':computer.find("div",class_="prc").text.strip(),
                'original_price':computer.find("div",class_="old").text.strip() if computer.find("div",class_="old") else None,
                'promo_element':computer.find("div", class_="bdg _dsct _sm").text.strip() if computer.find("div", class_="bdg _dsct _sm") else None
            })
        n+=1
        time_s = 10
        time.sleep(6 * time_s)
    return data

if __name__=="__main__":
    data = find_computers()
    with pd.ExcelWriter("output.xlsx") as writer:
        pd.DataFrame(data).to_excel(writer,sheet_name="sheet1")
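If you also want the product names as row labels, as in the original script's index=computer_name_list, a small follow-up sketch (using the 'name' key collected above) is to promote that column to the index before writing:

df = pd.DataFrame(data).set_index("name")
df.to_excel("output.xlsx", sheet_name="sheet1")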
Answered By - HedgeHog