Issue
I want to clean up scraped text from <dd>
and <dt>
, output looks like this:
Įrengimas:': 'Dalinė apdailaNAUDINGA:Interjero dizaineriai', 'Ypatybės:': 'Nauja kanalizacijaNauja elektros instaliacija',
My desired outputd:
Įrengimas:': 'Dalinė apdaila, 'Ypatybės:': 'Nauja kanalizacija, Nauja elektros instaliacija',
My code block, to get that text:
def get_dl(soup):
d_list = {}
for dl in soup.findAll("dl", {"class": "obj-details"}):
for el in dl.find_all(["dt", "dd"]):
if el.name == 'dt':
key = el.get_text(strip=True)
elif key in ['Plotas:', 'Buto numeris:', 'Metai:', 'Namo numeris:', 'Kambarių sk.:', 'Aukštas:', 'Aukštų sk.:', 'Pastato tipas:', 'Šildymas:', 'Įrengimas:', 'Pastato energijos suvartojimo klasė:', 'Ypatybės:', 'Papildomos patalpos:', 'Papildoma įranga:', 'Apsauga:']:
d_list[key] = el.get_text(strip=True)
return d_list
So, what is best practice to fix text in this situation ?
Adding full code, to see whole picture:
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import csv
PATH = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)
data = []
def get_dl(soup):
d_list = {}
for dl in soup.findAll("dl", {"class": "obj-details"}):
for el in dl.find_all(["dt", "dd"]):
if el.name == 'dt':
key = el.get_text(strip=True)
elif key in ['Plotas:', 'Buto numeris:', 'Metai:', 'Namo numeris:', 'Kambarių sk.:', 'Aukštas:', 'Aukštų sk.:', 'Pastato tipas:', 'Šildymas:', 'Įrengimas:', 'Pastato energijos suvartojimo klasė:', 'Ypatybės:', 'Papildomos patalpos:', 'Papildoma įranga:', 'Apsauga:']:
d_list[key] = el.get_text(strip=True)
return d_list
for puslapis in range(2, 3):
driver.get(f'https://www.aruodas.lt/butai/vilniuje/puslapis/{puslapis}')
response = driver.page_source
soup = BeautifulSoup(response, 'html.parser')
blocks = soup.find_all('tr', class_='list-row')
stored_urls = []
for url in blocks:
try:
stored_urls.append(url.a['href'])
except:
pass
for link in stored_urls:
driver.get(link)
response = driver.page_source
soup = BeautifulSoup(response, 'html.parser')
h1 = soup.find('h1', 'obj-header-text')
price = soup.find('div', class_ = 'price-left')
try:
address1 = h1.get_text(strip=True)
address2 = re.findall(r'(.*),[^,]*$', address1)
address = ''.join(address2)
city, district, street = address.split(',')
except:
address = 'NaN'
try:
full_price = price.find('span', class_ = 'price-eur').text.strip()
except:
full_price = 'NaN'
try:
price_sq_m = price.find('span', class_ = 'price-per').text.strip()
except:
price_sq_m = 'NaN'
try:
price_change = price.find('div', class_ = 'price-change').text.strip()
except:
price_change = 'NaN'
data.append({'city': city, 'district': district, 'street': street, 'full_price': full_price, 'price_sq_m': price_sq_m, 'price_change:': price_change, **get_dl(soup)})
for entry in data:
print(entry)
driver.quit()
Solution
Labas. Is the "Naudinga" part the main issue, are there any other words/phrases that can occur? If not, try this.
#Replace your line code:
d_list[key] = el.get_text(strip=True)
#With this mess:
d_list[key] = ' '.join(el.text.strip().replace("\n", ", ").split('NAUDINGA')[0].split())
Answered By - Gedas Miksenas
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.