Issue
I am trying to scrape the scikit-learn documentation (https://scikit-learn.org/stable/modules/linear_model.html) into a 2-column CSV: the first column would be the headings in the documentation and the second column would be the content under each heading. Even if there are multiple paragraphs under a heading, they should all be in the same row as the heading they are under.
This is my current code:
from bs4 import BeautifulSoup
import requests
# Fetch the scikit-learn linear-model page and collect h2 headings with the
# paragraphs that follow them.
page_link = 'https://scikit-learn.org/stable/modules/linear_model.html'
# NOTE(review): verify=False disables TLS certificate checking — avoid in
# production; timeout=5 guards against a hung connection.
page_response = requests.get(page_link, verify=False, timeout=5)
page_content = BeautifulSoup(page_response.content, "html.parser")

textContent = []
# Skip the first h2 (presumably the page-level heading — confirm against the page).
for tag in page_content.find_all('h2')[1:]:
    texth2 = tag.text.strip()
    textContent.append(texth2)
    # Walk the <p> siblings after this heading, keeping only those whose
    # nearest preceding h2 is still this heading.
    # NOTE(review): this assumes h2 and p are siblings in the DOM; on this
    # page headings and paragraphs live inside nested <section> elements,
    # which is why this approach finds no paragraphs — verify the markup.
    for item in tag.find_next_siblings('p'):
        if texth2 in item.find_previous_siblings('h2')[0].text.strip():
            textContent.append(item.text.strip())
Solution
Try:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Page to scrape: the scikit-learn user guide chapter on linear models.
url = "https://scikit-learn.org/stable/modules/linear_model.html"
# Fetch the page and parse it with the stdlib-backed "html.parser" backend.
soup = BeautifulSoup(requests.get(url).content, "html.parser")
def get_section(tag):
    """Yield (heading, text) pairs for *tag* and every nested <section>.

    The heading is the text of the first h1-h4 inside the section (with the
    trailing pilcrow anchor "¶" stripped); the text is all non-heading child
    content joined into a single space-separated string, so multiple
    paragraphs under one heading collapse into one row.
    """
    name = tag.find(["h1", "h2", "h3", "h4"]).text.strip("¶")
    text = []
    # Only direct children: nested <section>s are recursed into instead of
    # being flattened into this section's text.
    for s in tag.find_all(recursive=False):
        if s.name == "section":
            yield from get_section(s)
        elif s.name not in {"h1", "h2", "h3", "h4"}:
            # Collapse internal newlines so each heading's content is one line.
            text.append(
                s.get_text(strip=True, separator=" ").replace("\n", " ")
            )
    yield name, " ".join(text).strip()
# The top-level <section> inside the main content div is the document root.
root = soup.select_one("div[role=main] section")

# Sort sections by the numeric components of their heading
# (e.g. "1.1.2. Ridge..." -> (1, 1, 2)) so they appear in document order.
ordered_sections = sorted(
    get_section(root),
    key=lambda v: tuple(map(int, re.findall(r"\d+", v[0]))),
)

# Two-column frame: heading in one column, all its text in the other.
df = pd.DataFrame(tuple(ordered_sections), columns=["Heading", "Value"])

print(df.head(10).to_markdown(index=False))
df.to_csv("data.csv", index=False)
Prints the dataframe and saves data.csv
(screenshot from LibreOffice):
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.