Issue
I want to crawl maritime news from Fleetmon.com news as well as with detail pages and save it in text file. I tried BeautifulSoup in python but it not work properly..
import requests
from bs4 import BeautifulSoup
import pandas as pd
baseurl = 'https://www.fleetmon.com/maritime-news/'
headers = {'User-Agent': 'Mozilla/5.0'}
newslinks = [] # put all item in this array
for x in range(1): # set page range
response = requests.get(
f'https://www.fleetmon.com/maritime-news/?page={x}') # url of next page
soup = BeautifulSoup(response.content, 'html.parser')
newslist = soup.find_all('article')
# loop to get all href from ul
for item in newslist:
for link in item.find_all('a', href=True):
newslinks.append(link['href'])
newslinks = list(set(newslinks))
print(newslinks)
# news details pages
newsdata = []
for link in newslinks:
print(link)
response = requests.get(link, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
shipName = soup.find('div', {'class': 'uk-article-story'}).text.strip()
fieldsets = soup.find_all('article')
row = {'Ship Name': shipName}
for fieldset in fieldsets:
dts = fieldset.find_all('h1')
for dt in dts:
row.update({dt.text.strip(): dt.find_next('p').text.strip()})
newsdata.append(row)
#text or csv
df = pd.DataFrame(newsdata)
df.to_csv (r'C:\Users\Usuario\Desktop\news.csv', index = False, header=True)
print(df)
Help me to improve my code to get all data in text form.
Also is it possible to crawl data and save it csv like this:
- Column1:News_title:value
- column2:category: accidents
- column3:publish_date_time:June 28, 2022 at 13:31
- column4:news:full news here
Solution
Go to the details page (here I use req2 to go to the details page) and I 've made the pagination using for loop and range function and you can increase or decrease the page numbers with no time.
P/S: If you click on any title link then you can see the details page and from the details pages are scraped all required data items.
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
url='https://www.fleetmon.com/maritime-news/?page={page}'
data=[]
for page in range(1,11):
req = requests.get(url.format(page=page),headers=headers)
soup = BeautifulSoup(req.text, 'lxml')
for link in soup.select('.news-headline h2 a') :
link='https://www.fleetmon.com' + link.get('href')
req2 = requests.get(link,headers=headers)
soup2 = BeautifulSoup(req2.text, 'lxml')
title= soup2.find('h1',class_="uk-article-title margin-t-0").text
cat=soup2.select_one('p.uk-article-meta span a strong').text
date=soup2.select_one('[class="uk-text-nowrap"]:nth-child(3)').text
details=soup2.select_one('.uk-article-story ').get_text(strip=True)
data.append({
'title':title,
'category':cat,
'date':date,
'details_news':details
})
df = pd.DataFrame(data)#.to_csv('news.csv',index=False)
print(df)
Output:
Cruise ship NORWEGIAN SUN hit iceberg, damaged... ... Cruise ship NORWEGIAN SUN hit an
iceberg size ...
1 Yang Ming and HMM Were Accused of Collusion to... ... YM WARRANTY by ship spotter phduck2kYM WARRANT...
2 Fire in bulk carrier cargo hold, Florida ... At around 2350 LT Jun 26 firefighters responde...
3 Chlorine gas tank fell on Chinese cargo ship, ... ... Tank with 25 tons of chlorine gas fell onto ca...
4 Heavy vehicle fell onto cargo deck during offl... ... Heavy machinery vehicle (probably mobile crane...
.. ... ...
...
195 Yara Plans 15 Ammonia Bunkering Terminals in S... ... VIKING ENERGY by ship spotter PattayaVIKING EN...
196 World’s Largest Electric Cruise Ship Sets Sail... ... ©Wuxi Saisiyi Electric Technology,©Wuxi Saisiy...
197 The Supply Chain Crisis Brewing at Israeli Ports ... Port Haifa in FleetMon ExplorerPort Haifa in F...
198 CDC Drops Its “Cruise Ship Travel Health Notic... ... AIDADIVA by ship spotter Becks93AIDADIVA by sh...
199 Scorpio Tankers Take the Path of Shipboard Car... ... CORONA UTILITY by ship spotter canonbenqCORONA...
[200 rows x 4 columns]
Answered By - F.Hoque
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.