Issue
Hey, I want to scrape movies from this site: https://www.yidio.com/movies, but it only gets 100 movies, no more. I found that when I scroll down in the browser, the HTML code changes and hidden movies appear — it seems like each scroll adds 100 more movies.
I want to scrape at least 1000 movies. How can I do that with Beautiful Soup?
Here is some of my Python code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Define the URL of the movie page
list_url = 'https://www.yidio.com/movies'
# Send a GET request to the URL
response = requests.get(list_url)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'lxml')
# Find all the movie links on the page
movie_links = soup.find_all('a', class_='card movie')
# Extract the movie URLs
urls = [link['href'] for link in movie_links]
movies_info = {
'Title': [],
'Genres': [],
'Cast': [],
'Director': [],
'Release Date': [],
'MPAA Rating': [],
'Runtime': [],
'Language': [],
'IMDB Rating': [],
'Metascore': []
}
def get_movie_info(url):
    """Fetch a movie detail page and extract its metadata.

    Parameters
    ----------
    url : str
        Absolute URL of a movie page on yidio.com.

    The extracted fields are appended to the module-level ``movies_info``
    dict (that extraction code is elided below).
    """
    # A timeout keeps one slow page from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    # Check if the request was successful; skip failed pages explicitly
    # instead of silently doing nothing.
    if response.status_code != 200:
        print('skipping ' + url)
        return
    # Parse the HTML content using BeautifulSoup.
    soup = BeautifulSoup(response.content, 'lxml')
    '''
    Extract movie title, genres, cast, director, release date, MPAA rating, runtime, language,
    IMDB rating, meta score
    '''
    # The <h1> text starts with a leading word that is dropped here
    # (presumably a "Watch ..." prefix — confirm against the live page).
    # Fall back to the whole heading if it contains no space, so a short
    # heading does not raise IndexError.
    heading = soup.find('h1').text.strip()
    parts = heading.split(' ', 1)
    movie_title = parts[1] if len(parts) > 1 else parts[0]
    print('processing movie ' + movie_title + '...')
    '''
    some other code ...
    '''
def main():
    """Scrape every collected movie URL and write the results to a CSV file."""
    # Visit each movie detail page and collect its information.
    for movie_url in urls:
        get_movie_info(movie_url)

    # Build a DataFrame from the accumulated fields, then discard rows
    # missing any of the rating columns.
    movies_df = pd.DataFrame(movies_info).dropna(
        subset=['MPAA Rating', 'IMDB Rating', 'Metascore']
    )

    # Persist to disk; the index column is written under the name 'id'.
    movies_df.to_csv('movies_info.csv', index_label='id')

    print('==========================================================')
    print('\nMovie information saved to movies_info.csv successfully.')
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Solution
To get the names and URLs of all the movies, you can use their pagination API:
import requests
import pandas as pd

# Yidio's backend endpoint that the page itself calls while you scroll;
# it pages through results via the `index` (offset) and `limit` parameters.
api_url = 'https://www.yidio.com/redesign/json/browse_results.php'
params = {"type": "movie", "index": "0", "limit": "100"}

all_data = []
# Increase the stop value to fetch more movies (0, 100, 200, 300, ...).
for offset in range(0, 1500, 100):
    params['index'] = offset
    print(f"{params['index']=}")
    # A timeout prevents a stalled request from hanging the loop, and
    # raise_for_status() surfaces HTTP errors immediately instead of
    # failing later with a confusing JSON decode error.
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()
    all_data.extend(response.json()['response'])

df = pd.DataFrame(all_data)
print(df.tail())
Prints:
name type id url image
1495 Sabrina movie 14361 https://www.yidio.com/movie/sabrina/14361 //cfm.yidio.com/images/movie/14361/poster-193x290.jpg
1496 When Harry Met Sally... movie 11237 https://www.yidio.com/movie/when-harry-met-sally/11237 //cfm.yidio.com/images/movie/11237/poster-193x290.jpg
1497 Roll Bounce movie 23861 https://www.yidio.com/movie/roll-bounce/23861 //cfm.yidio.com/images/movie/23861/poster-193x290.jpg
1498 The Holiday movie 24956 https://www.yidio.com/movie/the-holiday/24956 //cfm.yidio.com/images/movie/24956/poster-193x290.jpg
1499 Juneteenth movie 229233 https://www.yidio.com/movie/juneteenth/229233 //cfm.yidio.com/images/movie/229233/poster-193x290.jpg
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.