Issue
The variable headline_html comes back as None when I pass the element_target variable as the argument to soup.find().
If I hard-code the arguments in soup.find(), it works, like this: soup.find("h2", class_="story__title"). Printing out element_target gives exactly this string.
There is no issue with the html target.
I've tried swapping the " and ' quotes in paper[3], to no avail. Assigning it to a variable before using it in soup.find() didn't help either.
What am I missing?
from difflib import restore
from venv import create
from requests import get
import random
import psycopg2
from bs4 import BeautifulSoup
import database
import requests
import datetime
# One row per newspaper: [id, front-page URL, display name, headline target].
# NOTE: the fourth field is a single plain string that merely *looks like* an
# argument list for soup.find(); scrapeHeadlines() passes it to find() as one
# positional argument.
papers = [
    [1, "https://www.mirror.co.uk/", "The Daily Mirror", '"h2", class_="story__title"'],
    [2, "https://www.theguardian.com/uk", "The Guardian", "'span', class_='js-headline-text'"],
    [3, "https://www.thesun.co.uk/", "The Sun", "'p', class_='teaser__subdeck'"],
    [4, "https://www.ft.com/world/uk", "The Financial Times", "'div', class_='o-teaser__heading'"],
    [5, "https://www.dailymail.co.uk/home/index.html", "The Daily Mail", "'h2', class_='linkro-darkred'"],
    [6, "https://www.thetimes.co.uk/uk", "The Times", "'h3', class_='Headline--xl'"]
]
def scrapeHeadlines():
    """Fetch one newspaper front page and print the headline found on it.

    Takes no arguments: the row to process is read from the module-level
    name ``paper``, so the function only works while a ``for paper in
    papers`` loop is driving it.  The outcome is printed as a one-element
    list of dicts; nothing is returned.
    """
    referer_pool = [
        "https://www.facebook.com/",
        "https://www.google.co.uk",
        "https://www.twitter.com"
    ]
    headers = {
        'User-Agent': 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        # A referer is picked at random from the pool for each request.
        'Referer': random.choice(referer_pool)
    }

    # Message used as the headline when no matching element is found.
    fallback = "Sorry but we could not get the headline for " + str(paper[2])
    # Computed here but not used anywhere else in this function.
    timestamp = '{:%b-%d-%Y %H:%M:%S}'.format(datetime.datetime.now())

    paper_id, url, newspaper = paper[0], paper[1], paper[2]
    element_target = paper[3]

    results = requests.get(url, headers=headers)
    # The parsed page itself can be printed and looks fine.
    soup = BeautifulSoup(results.text, "html.parser")

    # This is the call the question is about: element_target is ONE string
    # (e.g. '"h2", class_="story__title"') handed to find() as its only
    # positional argument.  (Also tried: soup.find(paper[3]).)
    headline_html = soup.find(element_target)

    headline = headline_html.text.strip() if headline_html is not None else fallback

    scrape_results = [{
        'id': paper_id,
        'paper': newspaper,
        'headline': headline,
        'headline_html': headline_html
    }]
    print(scrape_results)
# Driver: scrapeHeadlines() takes no arguments and instead reads the loop
# variable `paper` as a module-level global, one row per call.
for paper in papers:
    scrapeHeadlines()
The result is:
[{'id': 1, 'paper': 'The Daily Mirror', 'headline': 'Sorry but we could not get the headline for The Daily Mirror', 'headline_html': None}]
[{'id': 2, 'paper': 'The Guardian', 'headline': 'Sorry but we could not get the headline for The Guardian', 'headline_html': None}]
[{'id': 3, 'paper': 'The Sun', 'headline': 'Sorry but we could not get the headline for The Sun', 'headline_html': None}]
[{'id': 4, 'paper': 'The Financial Times', 'headline': 'Sorry but we could not get the headline for The Financial Times', 'headline_html': None}]
[{'id': 5, 'paper': 'The Daily Mail', 'headline': 'Sorry but we could not get the headline for The Daily Mail', 'headline_html': None}]
[{'id': 6, 'paper': 'The Times', 'headline': 'Sorry but we could not get the headline for The Times', 'headline_html': None}]
Solution
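You've stringified the parameters of find(): the tag name and the class_ keyword are packed into one string, so find() receives that whole string as the tag name and matches nothing. You need to split these out into separate values: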
# One row per newspaper:
# [id, front-page URL, display name, headline tag, headline CSS class].
# The tag and the class are separate fields so each can be passed to
# soup.find() as its own argument.
papers = [
    [1, "https://www.mirror.co.uk/", "The Daily Mirror", "h2", "story__title"],
    [2, "https://www.theguardian.com/uk", "The Guardian", "span", "js-headline-text"],
    [3, "https://www.thesun.co.uk/", "The Sun", "p", "teaser__subdeck"],
    [4, "https://www.ft.com/world/uk", "The Financial Times", "div", "o-teaser__heading"],
    [5, "https://www.dailymail.co.uk/home/index.html", "The Daily Mail", "h2", "linkro-darkred"],
    [6, "https://www.thetimes.co.uk/uk", "The Times", "h3", "Headline--xl"],
]


def scrapeHeadlines(paper):
    """Fetch one newspaper front page and report its headline.

    Args:
        paper: A row from ``papers``:
            ``[id, url, name, headline tag, headline CSS class]``.

    Returns:
        A one-element list holding a dict with the keys ``id``, ``paper``,
        ``headline`` and ``headline_html``.  The same list is printed.
        When no matching element is found, ``headline`` is an apology
        message and ``headline_html`` is ``None``.
    """
    random_urls = [
        "https://www.facebook.com/",
        "https://www.google.co.uk",
        "https://www.twitter.com",
    ]
    headers = {
        'User-Agent': 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Referer': random.choice(random_urls),
    }

    paper_id = paper[0]
    url = paper[1]
    newspaper = paper[2]
    element_target = paper[3]
    class_ = paper[4]

    # Without a timeout, requests waits indefinitely on an unresponsive host.
    results = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(results.text, "html.parser")

    # The actual fix: tag name and class go in as two separate arguments
    # instead of one string that only looks like an argument list.
    headline_html = soup.find(element_target, class_=class_)

    if headline_html is not None:
        headline = headline_html.text.strip()
    else:
        headline = "Sorry but we could not get the headline for " + str(newspaper)

    scrape_results = [{
        'id': paper_id,
        'paper': newspaper,
        'headline': headline,
        'headline_html': headline_html,
    }]
    print(scrape_results)
    return scrape_results


for paper in papers:
    scrapeHeadlines(paper)
Answered By - quamrana
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.