Issue
I have an Excel sheet that contains two columns (Url_id and Url). I looped through the Excel file and used BeautifulSoup to get the article title and article details from various websites. Now I want to create a text file using the url_id as the file name, and then store the output of the website corresponding to that url_id in the text file.
- The code is scraping all the data I need.
- The text files are being created, but the code is writing the same data into all of them.

Everything works except that it loops over the text files many times and writes the same post title and content to every one of them.
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Read the Excel file
data_ex = pd.read_excel('input.xlsx')

# Get the URL and URL_ID columns from the Excel file
urls = data_ex.URL
url_id = data_ex.URL_ID

for url in urls:
    # Connect to each URL
    res = requests.get(url)
    page = res.text
    soup = BeautifulSoup(page, 'html.parser')

    # Title of each URL's post
    article_title = soup.find_all(name="h1", class_="entry-title")
    article_texts = []
    article_details = []
    for details in article_title:
        text = details.getText()
        article_texts.append(text)

    # Post content corresponding to the URL's title
    article_writeup = soup.find(class_="td-post-content tagdiv-type").getText()

    for id in url_id:
        for story in article_texts:
            # Specify the folder to create files in
            folder = 'files_folder'
            # Create the folder if it doesn't exist
            if not os.path.exists(folder):
                os.makedirs(folder)

            # List of filenames
            filenames = [f"{id}.txt"]

            # Loop through the filenames and create the text files
            for filename in filenames:
                file_path = os.path.join(folder, filename)
                with open(file_path, 'w', encoding="utf-8") as file:
                    file.write(f"{story}\n {article_writeup}")
Solution
Check your nested for loops and what you are putting inside of them.
for id in url_id:
    for story in article_texts:
        filenames = [f"{id}.txt"]
        for filename in filenames:
            file_path = os.path.join(folder, filename)
            with open(file_path, 'w', encoding="utf-8") as file:
                file.write(f"{story}\n {article_writeup}")
Read it this way: for every id in url_id, you loop over every story in article_texts and write it to f"{id}.txt", reopening the file in "w" mode (which truncates it) on each write. And because this whole block sits inside for url in urls, every page you scrape rewrites all of the files again, so every file ends up holding the title and content of the last URL processed.
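Here is a minimal sketch of the difference, with made-up ids and stories standing in for the scraped data, and print standing in for the file writes:

url_ids = [37, 38, 39]
stories = ["story A", "story B", "story C"]  # pretend one story per URL

# Nested loops: every id is paired with every story (3 x 3 = 9 writes),
# so each file is rewritten until only the last story survives.
for uid in url_ids:
    for story in stories:
        print(f"{uid}.txt <- {story}")

# zip: each id is paired with exactly one story (3 writes total).
for uid, story in zip(url_ids, stories):
    print(f"{uid}.txt <- {story}")

Pairing the two columns with zip is what the updated version below does.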
EDIT:
After reading your code I had to guess at what you needed, since you didn't show what your data looks like (input) or how you want it formatted (output). Here is an updated version based on your code.
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

data_ex = pd.read_excel("input.xlsx")
urls = data_ex.URL
url_ids = data_ex.URL_ID

folder = "files_folder"
Path(folder).mkdir(parents=True, exist_ok=True)

for url, url_id in zip(urls, url_ids):
    res = requests.get(url, timeout=60)
    soup = BeautifulSoup(res.text, "html.parser")

    # A post normally has a single h1.entry-title; join them just in case
    # a page carries more than one.
    article_title = soup.find_all(name="h1", class_="entry-title")
    text = "\n".join(details.getText() for details in article_title)

    article_writeup = soup.find(class_="td-post-content tagdiv-type").getText()

    # One file per url_id, written exactly once per URL
    file_path = Path(folder, f"{url_id}.txt")
    with file_path.open("w", encoding="utf-8") as f:
        f.write(f"{text}\n {article_writeup}")
Answered By - 0x00