Issue
I'm not sure how I can count a URL on a page just once.
For example, this page https://www.ig.com/uk/news-and-trade-ideas/ includes the article https://www.ig.com/uk/news-and-trade-ideas/early-morning-call--221103 4 times in different sections.
How can I record it just once?
from cgitb import text
import requests
from bs4 import BeautifulSoup
import gspread
import datetime
import urllib
from urllib.parse import urlparse
# Authenticate with the service-account credentials in creds.json, then bind
# the first worksheet of the "scrapetosheets" spreadsheet for later use.
gc = gspread.service_account(filename="creds.json")
sh = gc.open("scrapetosheets").sheet1
# Listing pages to scrape, two per site section prefix.
urls = [
    "https://www.ig.com/uk/trading-strategies",
    "https://www.ig.com/uk/news-and-trade-ideas",
    "https://www.ig.com/us/trading-strategies",
    "https://www.ig.com/us/news-and-trade-ideas",
    "https://www.ig.com/en/trading-strategies",
    "https://www.ig.com/en/news-and-trade-ideas",
    "https://www.ig.com/za/trading-strategies",
    "https://www.ig.com/za/news-and-trade-ideas",
    "https://www.ig.com/au/trading-strategies",
    "https://www.ig.com/au/news-and-trade-ideas",
    "https://www.ig.com/fr/strategies-de-trading",
    "https://www.ig.com/fr/marche-actualites-et-idees-de-trading",
    "https://www.ig.com/de/trading-strategien",
    "https://www.ig.com/de/nachrichten-und-trading-ideen",
    "https://www.ig.com/it/strategie-di-trading",
    "https://www.ig.com/it/news-e-idee-di-trading",
    "https://www.ig.com/es/estrategias-de-trading",
    "https://www.ig.com/es/ideas-de-trading-y-noticias",
    "https://www.ig.com/en-ch/trading-strategies",
    "https://www.ig.com/en-ch/news-and-trade-ideas",
    "https://www.ig.com/cn/trading-strategies",
    "https://www.ig.com/cn/news-and-trade-ideas",
    "https://www.ig.com/se/tradingstrategier",
    "https://www.ig.com/se/nyheter-och-trading-ideer",
    "https://www.ig.com/nl/nieuws-en-trading-ideeen",
    "https://www.ig.com/nl/trading-strategieen",
    "https://www.ig.com/jp/trading-strategies",
    "https://www.ig.com/jp/news-and-trade-ideas",
]
# Scrape every listing page and append rows for articles whose URL is not
# already stored in the sheet.
#
# The third column (index 2) of each existing row holds a previously recorded
# URL; rows shorter than three cells are ignored instead of raising IndexError.
obj = {r[2]: True for r in sh.get_all_values() if len(r) > 2}
ar = []
base = "https://www.ig.com"
for page_url in urls:
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    for item in soup.find_all("h3", class_="article-category-section-title"):
        # Look the anchor up once; skip headings that lack it or its href.
        link = item.find("a", class_="primary js_target")
        if link is None or not link.get("href"):
            continue
        date = datetime.date.today()
        title = link.text.strip()
        rel = link.get("href")
        # The last six characters of the href are treated as a yymmdd stamp
        # and rearranged into "dd mm yy".
        pub = rel[-6:]
        datestring = f"{pub[4:6]} {pub[2:4]} {pub[0:2]}"
        # URLs are stored without the scheme, the same form that is read back
        # into obj from the sheet.
        url = urllib.parse.urljoin(base, rel).replace("https://", "")
        # NOTE: obj is not updated inside this loop, so an article linked from
        # several sections of a page is appended once per occurrence.
        if url not in obj:
            ar.append([str(date), title, url, datestring])
# Only call the Sheets API when there is something new to write.
if ar:
    sh.append_rows(ar, value_input_option="USER_ENTERED")
Solution
There are several approaches:
- Append the URLs to a `list` and, while iterating, check whether the current URL is already in the `list`; if it is, skip scraping it.
- Or, as mentioned, use a `set` and operate from the links:

      articles = []
      for e in set(soup.select('h3>a')):
          e = e.find_parent('div')
          articles.append({
              'url':e.a.get('href'),
              'title':e.get_text(strip=True),
              'date':e.select_one('.article-category-section-date').get_text(strip=True) if e.select_one('.article-category-section-date') else None
          })
      articles
- Or collect your information in a `list` of `dict`s and iterate over the values to keep only the unique ones:

      list({v['url']:v for v in articles}.values())
...
Example
import requests
from bs4 import BeautifulSoup
# Fetch the listing page and parse it with an explicitly named parser so the
# result does not depend on which parsers happen to be installed.
r = requests.get('https://www.ig.com/uk/news-and-trade-ideas/')
soup = BeautifulSoup(r.content, 'html.parser')

# One entry per <h3> that has a direct <a> child. The same article can be
# linked from several sections, so this list may contain repeated URLs.
articles = [
    {
        'url': e.a.get('href'),
        'title': e.get_text(strip=True),
    }
    for e in soup.select('h3:has(>a)')
]

# Keying a dict by URL keeps a single entry per URL (the last one seen).
unique_articles = list({v['url']: v for v in articles}.values())

print('With duplicates: ', len(articles))
print('Without duplicates: ', len(unique_articles))
# Bare expression: shows the list when run in a REPL or notebook cell.
unique_articles
Output
With duplicates: 36
Without duplicates: 29
[{'url': '/uk/news-and-trade-ideas/_brent-crude-oil--gold-and-us-natural-gas-rallies-pause-amid-us--221108',
'title': '\u200bBrent crude oil, gold and US natural gas ral...'},
{'url': '/uk/news-and-trade-ideas/early-morning-call--gloomy-festive-season-ahead-amid-consumer-we-221108',
'title': 'Early Morning Call: dollar basket steady ahead of ...'},
{'url': '/uk/news-and-trade-ideas/nasdaq-listed-ryanair-posts-record-h1-results-221107',
'title': 'Ryanair shares up after record H1 result...'},...]
Answered By - HedgeHog
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.