Issue
from bs4 import BeautifulSoup
import requests
import pymongo
def traverse_source():
    """Collect article links from the issue's table-of-contents pages and print them.

    Requests nine pages, gathers the ``href`` of the link inside every
    ``<h3 class="toc__item__title">`` element, and prints the accumulated
    list of absolute article URLs. Returns nothing.
    """
    article_links = []
    for pgindx in range(9):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            # NOTE: "path" is sent as an ordinary request header. It does not
            # change which URL is requested, so the page index placed here
            # never reaches the server as part of the address.
            "path": f"issue/S0196-0644(21)X0012-1?pageStart={pgindx}",
            "Sec-fetch-site": "same-origin",
        }
        # The URL is blank in the post as published (presumably removed before
        # posting). requests rejects an empty URL, so a real address that
        # includes the page index has to be supplied here.
        source_url = ""
        source_data = requests.get(source_url, headers=headers)
        print(source_data.headers)
        source_url = None
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        destination = source_soup.find_all("h3", attrs={'class': 'toc__item__title'})
        for dest in destination:
            try:
                article_links.append("https://www.annemergmed.com" + dest.a['href'])
            except (TypeError, KeyError):
                # TypeError: the heading has no <a> child (dest.a is None).
                # KeyError: the <a> has no href attribute.
                # Either way there is no link to record for this heading.
                continue
        source_soup = None
    print(article_links)


if __name__ == "__main__":
    traverse_source()
Even after incrementing the page number in the URL, the content of the first page is always scraped. I tried navigating through the pages with GET requests (changing the URL each time), but even after changing the source URL it still scrapes the data of page 1.
Solution
This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Scrape title, URL and authors for every article listed on the nine
# table-of-contents pages of the issue, then show them as a DataFrame.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

big_list = []
# One Session sends the same headers on every request; the with-block closes
# it (and its pooled connections) once all pages are fetched.
with requests.Session() as s:
    s.headers.update(headers)
    for x in tqdm(range(9)):
        # The page index goes into the query string of the URL itself.
        # A timeout keeps a stalled connection from hanging the script.
        r = s.get(
            f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}',
            timeout=30,
        )
        soup = BeautifulSoup(r.text, 'html.parser')
        for t in soup.select('div.articleCitation'):
            link = t.select_one('h3 a')
            # Skip entries without a usable link instead of failing on None.
            if link is None or link.get('href') is None:
                continue
            url = link.get('href')
            header = link.text
            # Some entries carry no author list; select_one returns None then.
            authors_tag = t.select_one('ul.toc__item__authors')
            if authors_tag is not None:
                authors = authors_tag.get_text(strip=True)
            else:
                authors = 'Unknown'
            big_list.append((header, f'https://www.annemergmed.com{url}', authors))

# dict.fromkeys drops duplicate rows while keeping first-seen order, so the
# rows come out in the same order on every run.
df = pd.DataFrame(list(dict.fromkeys(big_list)), columns=['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]
Answered By - platipus_on_fire
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.