Issue
I'm getting the error AttributeError: 'NoneType' object has no attribute 'find'.
After doing some research before posting here, I think the problem could be that Cloudflare is blocking my access to Spotify. What would be the workaround for this problem?
Part of my code looks like this:
# Module-level accumulators: date strings, chart page URLs, and scraped rows.
dates=[]
urls=[]
final=[]
# Base address of the US daily chart. Note that it has no trailing slash.
url = 'https://spotifycharts.com/regional/us/daily'
start_date = date(2022,3,1)
end_date = date(2022,4,30)
delta = end_date - start_date
# print(delta.days+1)
# Build one 'YYYY-MM-DD' string per day; the +1 makes the end date inclusive.
for i in range(delta.days+1):
    day= start_date +timedelta(days=i)
    day_string =day.strftime('%Y-%m-%d')
    dates.append(day_string)
def add_url():
    """Append one chart URL per entry of ``dates`` to the module-level ``urls`` list."""
    for date in dates:
        # Plain concatenation: the date string is glued directly onto ``url``
        # with no separator added here.
        c_string=url+date
        urls.append(c_string)
# Populate ``urls`` once, at import time.
add_url()
def song_scrape(x):
    """Append one [title, artist, song id, chart date] entry to ``final`` per table row.

    ``x`` is the chart page URL. ``songs`` is not a parameter: it is read from
    module scope, where the request loop assigns it before each call.
    """
    # pg is assigned but never used below.
    pg = x
    # If ``songs`` is None (no matching table was found in the page), this
    # .find("tbody") call is where "'NoneType' object has no attribute 'find'"
    # is raised.
    for tr in songs.find("tbody").findAll("tr"):
        artist = tr.find("td", {"class": "chart-table-track"}).find("span").text
        # Drop the "by " prefix and surrounding whitespace from the artist text.
        artist = artist.replace("by ", "").strip()
        title = tr.find("td", {"class": "chart-table-track"}).find("strong").text
        songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
        # Keep only the part of the link that follows "track/".
        songid = songid.split("track/")[1]
        # The text after "daily/" in the URL is stored as the chart date.
        url_date = x.split("daily/")[1]
        final.append([title, artist, songid, url_date])
# Download each chart page, locate its table, and collect the rows into ``final``.
for u in urls:
    read_pg= requests.get(u)
    # Wait two seconds after every request.
    sleep(2)
    # return read_pg.status_code
    soup= BeautifulSoup(read_pg.content, "html.parser")
    # ``songs`` is None when the response holds no <table class="chart-table">;
    # song_scrape() reads this module-level name rather than taking it as an argument.
    songs = soup.find("table", {"class": "chart-table"})
    song_scrape(u)
# Turn the collected rows into a DataFrame and write it out as CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
with open('spmooddata.csv', 'w') as f:
    final_df.to_csv(f, header= True, index=False)
Solution
As I mentioned in my comment, you need to add certain code, as shown in this answer, to solve the 403 Forbidden error.
After making more changes to your code, I was able to get the data.
This is your modified and working code:
# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd
# Variables:
urls = []   # chart page URLs, filled by add_url()
final = []  # scraped rows: [title, artist, song id, chart date]

# Base address of the US daily chart; the trailing slash lets a date string
# be appended directly and lets the date be recovered by splitting on "daily/".
url = 'https://spotifycharts.com/regional/us/daily/'

start_date = datetime(2022, 3, 1)
end_date = datetime(2022, 3, 5)
delta = end_date - start_date

# One 'YYYY-MM-DD' string per day; the +1 makes the end date inclusive.
dates = [
    (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]
def add_url():
    """Extend the module-level ``urls`` list with one chart URL per date.

    Each URL is the base ``url`` followed directly by a string from ``dates``.
    """
    urls.extend(url + day_string for day_string in dates)


# Populate ``urls`` once, at import time.
add_url()
def song_scrape(x, songs, out=None):
    """Extract every chart row from one daily chart table.

    Args:
        x: URL of the chart page. The text after ``"daily/"`` is stored as
            the chart date of every row.
        songs: Parsed ``<table>`` element holding the chart, or ``None`` if
            the page contained no table.
        out: List that receives one ``[title, artist, song_id, chart_date]``
            entry per table row. Defaults to the module-level ``final`` list.

    Raises:
        ValueError: If ``songs`` is ``None`` or contains no ``<tbody>``. This
            replaces the opaque "'NoneType' object has no attribute 'find'"
            error with a message that names the offending URL.
    """
    if out is None:
        out = final

    body = songs.find("tbody") if songs is not None else None
    if body is None:
        raise ValueError(
            f"No chart table body found for {x}; "
            "the request may have been blocked or the page layout changed."
        )

    # The chart date depends only on the URL, so compute it once per page.
    url_date = x.split("daily/")[1]

    for tr in body.find_all("tr"):
        track_cell = tr.find("td", {"class": "chart-table-track"})
        image_cell = tr.find("td", {"class": "chart-table-image"})

        # The artist text is prefixed with "by "; strip that and any padding.
        artist = track_cell.find("span").text.replace("by ", "").strip()
        title = track_cell.find("strong").text
        # The song id is whatever follows "track/" in the link target.
        songid = image_cell.find("a").get("href").split("track/")[1]

        out.append([title, artist, songid, url_date])
# Avoid http 403 forbidden error with this code:
# Source: https://stackoverflow.com/a/43590290/12511801
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

# Download each chart page, locate its table, and collect the rows into ``final``.
for u in urls:
    # A timeout keeps one stalled connection from hanging the whole run.
    read_pg = requests.get(u, headers=header, timeout=30)
    # Surface 403/5xx responses right away instead of parsing an error page.
    read_pg.raise_for_status()
    # Wait two seconds between requests.
    sleep(2)

    soup = BeautifulSoup(read_pg.text, "html.parser")

    # Using BeautifulSoup, we're getting the specific data from the HTML:
    # There is only 1 table = which is the table with the data to extract:
    songs = soup.find("table")
    if songs is None:
        raise RuntimeError(f"No chart table found at {u}")

    # Call "song_scrape" function to retrieve the data from the table:
    song_scrape(u, songs)

final_df = pd.DataFrame(final, columns=["Title", "Artist", "Song ID", "Chart Date"])
# print(final_df) # Print the dataframe, if you want

# Passing the path lets pandas open the file itself with the newline handling
# the csv writer needs, so no blank rows appear between records on Windows.
final_df.to_csv('spmooddata.csv', header=True, index=False)
Answered By - Marco Aurelio Fernandez Reyes
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.