Issue
I have been trying to scrape Medium content but was unable to get all the h1 tags. I was able to get all the p tags through to the end, but the h1 tags that appear in between the text are missing.
I want to be able to scrape all the content in order of appearance, along with all the subheadings in h1 tags.
This is what I have done:
import stuff
import requests
import bs4
import os
import shutil
from PIL import Image
# Question script: fetch an article, collect its text-bearing tags, and print
# the longer pieces of text. Indentation restored; the logic is unchanged.
article_URL = 'https://medium.com/bhavaniravi/build-your-1st-python-web-app-with-flask-b039d11f101c' #@param {type:"string"}
# article_URL = 'https://www.tmz.com/2020/07/29/dr-dre-answers-wife-divorce-petition-prenup/'
response = requests.get(article_URL)
soup = bs4.BeautifulSoup(response.text,'html')
# NOTE: 'h1' is not in this tag list, so <h1> subheadings are never collected.
paragraphs = soup.find_all(['li', 'p', 'strong', 'em'])
title = soup.find(['h1','title']).get_text()
print(title)
txt_list = []
tag_list = []
with open('content2.txt', 'w') as f:
    # Only the title is written to the file; the body text is just printed.
    f.write(title + '\n\n')
    for p in paragraphs:
        # NOTE: `p.href` looks up a child tag named <href>, not the href
        # attribute, so this branch is effectively never taken.
        if p.href:
            pass
        else:
            if len(p.get_text()) > 100: # this filters out things that are most likely not part of the core article
                # print(p.href)
                tag_list.append(p.name)
                txt_list.append(p.get_text())
    txt_list2 = []
    tag_list2 = []
    for i in range(len(txt_list)):
        # if '\n' not in txt_list[i]:
        print(txt_list[i])
        # print(len(txt_list[i]))
        # print(tag_list[i])
        print()
        # Compare the first five words of this entry with the previous one.
        # NOTE: at i == 0, index i-1 is -1, i.e. the last entry in the list.
        comp1 = txt_list[i].split()[0:5]
        comp2 = txt_list[i-1].split()[0:5]
        # Both branches are still placeholders.
        if comp1 == comp2:
            pass
        else:
            pass
Solution
You need to add the `h1` tag to your `paragraphs` list, like this:
paragraphs = soup.find_all(['li', 'p', 'strong', 'em', 'h1'])
Also, `title` should look like this:
title = soup.find(['title']).get_text()
Now all `<h1>` tags are in your `paragraphs` list. Instead of saving them into a separate array and then piecing everything back together, I would keep the `<h1>` tags in the same list and later check which tag each element is, running different code for different tags such as `<p>` or `<h1>`.
import requests
import bs4
import os
import shutil
from PIL import Image
# Answer script: fetch an article and print its paragraphs and <h1>
# subheadings, each labelled with the kind of tag it came from.
article_URL = 'https://medium.com/bhavaniravi/build-your-1st-python-web-app-with-flask-b039d11f101c' #@param {type:"string"}

# Fail fast on network stalls and HTTP errors instead of parsing an error page.
response = requests.get(article_URL, timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Body tags and 'h1' are requested in a single find_all() call so headings
# and text end up in one list.
paragraphs = soup.find_all(['li', 'p', 'strong', 'em', 'h1'])

# A page without a <title> element yields an empty title instead of crashing.
title_tag = soup.find('title')
title = title_tag.get_text() if title_tag is not None else ''
print(title)

# Explicit encoding so non-ASCII titles write the same way on every platform.
with open('content2.txt', 'w', encoding='utf-8') as f:
    f.write(title + '\n\n')

# Keep only tags with more than five characters of text.
# NOTE: the earlier `if not p.href` test was dropped. In Beautiful Soup,
# `p.href` searches for a child tag named <href> (always None for HTML), so
# the test never filtered anything.
tag_list = [p for p in paragraphs if len(p.get_text()) > 5]

for tag in tag_list:
    text = tag.get_text()
    if tag.name in ("p", "li", "strong", "em"):
        # Code run when text is in a body-text tag (<p>, <li>, <strong>, <em>)
        print(f"p: {text}")
    elif tag.name == "h1":
        # Code run when text is in a <h1> html tag
        print(f"h1: {text}")
This will run different code for the `<p>` and the `<h1>` HTML tags.
Answered By - sp4c38
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.