Issue
I am trying to make a list of all the stocks listed in the table on Yahoo Finance's "Top Undervalued Growth Stocks" page. In this program, I am trying to print out the tags of all the table rows; however, only the tags of the first 25 are printing.
from bs4 import BeautifulSoup
import requests
import re

offset = 0
screener_url = f"https://finance.yahoo.com/screener/predefined/undervalued_growth_stocks/?count=100&offset={offset}"

# Fetch the first page and parse it so we can read the total result count.
first_page = requests.get(screener_url).text
first_doc = BeautifulSoup(first_page, "html.parser")

# The "1-25 of N results" label holds the total number of listed stocks.
results_label = first_doc.find(class_="Mstart(15px) Fw(500) Fz(s)")
total_stocks = int(results_label.string.split("of ")[-1].split(" results")[0])

# Page through the screener, 25 rows at a time, printing each row's <a> tag.
while offset < total_stocks:
    page_url = f"https://finance.yahoo.com/screener/predefined/undervalued_growth_stocks/?count=100&offset={offset}"
    page_html = requests.get(page_url).text
    page_doc = BeautifulSoup(page_html, "html.parser")

    stock_table = page_doc.find("table")
    for table_row in stock_table.find_all("tr"):
        print(table_row.find("a"))

    offset += 25
I tried to add an offset that increases each time the while loop runs, allowing me to view the next 25 stocks; however, even in the loop, the output just keeps returning the first 25 table rows even though the offset is increasing.
Solution
The problem is the missing request headers. I also made a few other changes, like looping directly over the rows and using `select` instead of `find`; they won't make much difference.
from bs4 import BeautifulSoup
import requests
def get_col_value(value):
    """Return the text of a table cell, or '-' when the cell is missing.

    Some rows lack a <td> for certain columns, so `value` may be None
    (the result of a failed `select_one`).
    """
    return value.text if value else '-'
def parse_page(html):
    """Parse one screener results page into a list of row dicts.

    Each dict maps a column header (from the table's <thead>) to that
    row's cell text; missing cells become '-' via `get_col_value`.
    Returns an empty list when the page contains no table rows.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Column names come from the table header cells.
    headers = [th.text for th in soup.select('#scr-res-table table > thead th')]

    # One dict per body row, keyed by column name; nth-child is 1-based,
    # so column index idx maps to td:nth-child(idx + 1).
    return [
        {
            name: get_col_value(tr.select_one(f'td:nth-child({idx + 1})'))
            for idx, name in enumerate(headers)
        }
        for tr in soup.select('#scr-res-table table > tbody > tr')
    ]
# Browser-like headers: without a real user-agent Yahoo serves the same
# first page regardless of the offset parameter (the bug in the question).
HEADERS = {
    'Connection': 'keep-alive',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
}
url = "https://finance.yahoo.com/screener/predefined/undervalued_growth_stocks/"
offset = 0
# constant increment for count and offset
INCREMENT = 100
data = []
while True:
    # sending requests with count, offset and header
    res = requests.get(f'{url}?count={INCREMENT}&offset={offset}', headers=HEADERS)
    offset += INCREMENT
    print("In page", int(offset/INCREMENT))
    page_data = parse_page(res.text)
    print(page_data)
    print("number of rows:", len(page_data))
    # request always gives 200 status even with empty page_data, so checking
    # length of rows to decide when to stop paging
    if not page_data:
        break
    data.extend(page_data)
    # NOTE: removed a stray `input()` left over from debugging — it paused
    # the scrape after every page waiting for the user to press Enter.
print(data)
print("Total:", len(data))
Answered By - Reyot
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.