Issue
I have a Scrapy project that I am trying to complete with limited experience. I've implemented the next-page and inner-page steps: my code visits the ads on a results page one by one, scrapes the data from each, and then moves on to the next page. I checked every step with one or two variables.
import scrapy
from urllib.parse import urljoin
from scrapy import signals
from pydispatch import dispatcher
import json
import time
class AtasehirSpider(scrapy.Spider):
    """Spider that follows every ad linked from a search-result page and prints its details.

    ``parse`` handles the result listing and pagination; ``parseInnerPage``
    handles a single ad page. This is the version as first posted: its last
    dict entry calls ``getall()`` on a ``set`` and therefore raises the
    AttributeError quoted right after the listing.
    """

    name = 'atasehir'
    allowed_domains = ['www.sahibinden.com','sahibinden.com']
    start_urls = ['https://www.sahibinden.com/satilik-daire/istanbul-atasehir-ornek-ornek-mh.?address_region=2&sorting=price_asc&price_min=805000&price_max=900000']

    def parse(self, response):
        """Yield one request per ad link on the page, then a request for the next page."""
        for ad in response.xpath("//td[@class='searchResultsLargeThumbnail']/a/@href").getall():
            # NOTE(review): time.sleep() pauses the whole process, not just this
            # request; Scrapy's DOWNLOAD_DELAY setting is the usual throttle - confirm.
            time.sleep(10)
            ads = response.urljoin(ad)
            yield response.follow(url=ads, callback=self.parseInnerPage)
        next_page_url = response.xpath("//ul[@class='pageNaviButtons']/li/a[@title='Sonraki']/@href").extract_first()
        nextPage = response.urljoin(next_page_url)
        # NOTE(review): nextPage is the result of urljoin(), so this check
        # presumably never fails even when no 'Sonraki' link exists; testing
        # next_page_url instead would be the usual guard - confirm.
        if nextPage is not None:
            time.sleep(10)
            yield scrapy.Request(nextPage)

    def parseInnerPage(self, response):
        """Read the ad's fields from the page and print them as one dict.

        The twenty info fields are read by position (li[1] .. li[20]) from the
        'classifiedInfoList' list; the price and the selected feature entries
        are read from their own containers.
        """
        ilan_no = response.xpath("//ul[@class='classifiedInfoList']/li[1]/span/text()").get()
        ilan_tarihi = response.xpath("//ul[@class='classifiedInfoList']/li[2]/span/text()").get()
        emlak_tipi = response.xpath("//ul[@class='classifiedInfoList']/li[3]/span/text()").get()
        metrekare_brut = response.xpath("//ul[@class='classifiedInfoList']/li[4]/span/text()").get()
        metrekare_net = response.xpath("//ul[@class='classifiedInfoList']/li[5]/span/text()").get()
        oda_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[6]/span/text()").get()
        bina_yasi = response.xpath("//ul[@class='classifiedInfoList']/li[7]/span/text()").get()
        bulundugu_kat = response.xpath("//ul[@class='classifiedInfoList']/li[8]/span/text()").get()
        kat_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[9]/span/text()").get()
        isitma = response.xpath("//ul[@class='classifiedInfoList']/li[10]/span/text()").get()
        banyo_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[11]/span/text()").get()
        balkon = response.xpath("//ul[@class='classifiedInfoList']/li[12]/span/text()").get()
        esyali = response.xpath("//ul[@class='classifiedInfoList']/li[13]/span/text()").get()
        kullanim_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[14]/span/text()").get()
        site_icerisinde = response.xpath("//ul[@class='classifiedInfoList']/li[15]/span/text()").get()
        site_adi = response.xpath("//ul[@class='classifiedInfoList']/li[16]/span/text()").get()
        aidat = response.xpath("//ul[@class='classifiedInfoList']/li[17]/span/text()").get()
        krediye_uygun = response.xpath("//ul[@class='classifiedInfoList']/li[18]/span/text()").get()
        tapu_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[19]/span/text()").get()
        kimden = response.xpath("//ul[@class='classifiedInfoList']/li[20]/span/text()").get()
        fiyat = response.xpath("//div[@class='classifiedInfo ']/h3[contains(text(), 'TL')]/text()").get()
        box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
        # Indentation reconstructed: the one-element sets printed in the
        # discussion imply set_box is rebuilt (and was printed) inside the loop.
        for word in box:
            # The result of replace() is discarded: the next line rebinds box
            # from word again.
            box = word.replace(u"\n", "")
            box = word.strip()
            # Rebinds set_box to a new one-element set on every pass, so only
            # the last entry is left once the loop ends.
            set_box =set({box})
            #print(set_box)
        ozellikler = set_box
        # NOTE(review): every value below comes from .get(); if an XPath matched
        # nothing the value would presumably be None and .strip() would fail - confirm.
        item = {
            'ilan_no' : ilan_no.strip(),
            'fiyat' : fiyat.strip(),
            'kimden' : kimden.strip(),
            'tapu_durumu' : tapu_durumu.strip(),
            'krediye_uygun' : krediye_uygun.strip(),
            'aidat' : aidat.strip(),
            'site_adi' : site_adi.strip(),
            'site_icerisinde' : site_icerisinde.strip(),
            'kullanim_durumu' : kullanim_durumu.strip(),
            'esyali' : esyali.strip(),
            'balkon' : balkon.strip(),
            'banyo_sayisi' : banyo_sayisi.strip(),
            'isitma' : isitma.strip(),
            'kat_sayisi' : kat_sayisi.strip(),
            'bulundugu_kat' : bulundugu_kat.strip(),
            'bina_yasi' : bina_yasi.strip(),
            'oda_sayisi' : oda_sayisi.strip(),
            'metrekare_net' : metrekare_net.strip(),
            'metrekare_brut' : metrekare_brut.strip(),
            'emlak_tipi' : emlak_tipi.strip(),
            'ilan_tarihi' : ilan_tarihi.strip(),
            # set_box is a plain Python set, which has no getall() method; this
            # is the line that raises the AttributeError quoted after the listing.
            'ozellikler' : set_box.getall(),
        }
        print(item)
Here is my code. When I run it, it raises the following AttributeError:
'ozellikler' : set_box.getall(),
AttributeError: 'set' object has no attribute 'getall'
Then I removed the getall() call from the variable in question, "set_box", so my code became:
import scrapy
from urllib.parse import urljoin
from scrapy import signals
from pydispatch import dispatcher
import json
import time
class AtasehirSpider(scrapy.Spider):
    """Spider that follows every ad linked from a search-result page and prints its details.

    ``parse`` handles the result listing and pagination; ``parseInnerPage``
    handles a single ad page. In this version the printed dict stores
    ``set_box`` directly, which holds only the last feature entry because the
    loop rebuilds the set on every pass.
    """

    name = 'atasehir'
    allowed_domains = ['www.sahibinden.com','sahibinden.com']
    start_urls = ['https://www.sahibinden.com/satilik-daire/istanbul-atasehir-ornek-ornek-mh.?address_region=2&sorting=price_asc&price_min=805000&price_max=900000']

    def parse(self, response):
        """Yield one request per ad link on the page, then a request for the next page."""
        for ad in response.xpath("//td[@class='searchResultsLargeThumbnail']/a/@href").getall():
            # NOTE(review): time.sleep() pauses the whole process, not just this
            # request; Scrapy's DOWNLOAD_DELAY setting is the usual throttle - confirm.
            time.sleep(10)
            ads = response.urljoin(ad)
            yield response.follow(url=ads, callback=self.parseInnerPage)
        next_page_url = response.xpath("//ul[@class='pageNaviButtons']/li/a[@title='Sonraki']/@href").extract_first()
        nextPage = response.urljoin(next_page_url)
        # NOTE(review): nextPage is the result of urljoin(), so this check
        # presumably never fails even when no 'Sonraki' link exists; testing
        # next_page_url instead would be the usual guard - confirm.
        if nextPage is not None:
            time.sleep(10)
            yield scrapy.Request(nextPage)

    def parseInnerPage(self, response):
        """Read the ad's fields from the page and print them as one dict.

        The twenty info fields are read by position (li[1] .. li[20]) from the
        'classifiedInfoList' list; the price and the selected feature entries
        are read from their own containers.
        """
        ilan_no = response.xpath("//ul[@class='classifiedInfoList']/li[1]/span/text()").get()
        ilan_tarihi = response.xpath("//ul[@class='classifiedInfoList']/li[2]/span/text()").get()
        emlak_tipi = response.xpath("//ul[@class='classifiedInfoList']/li[3]/span/text()").get()
        metrekare_brut = response.xpath("//ul[@class='classifiedInfoList']/li[4]/span/text()").get()
        metrekare_net = response.xpath("//ul[@class='classifiedInfoList']/li[5]/span/text()").get()
        oda_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[6]/span/text()").get()
        bina_yasi = response.xpath("//ul[@class='classifiedInfoList']/li[7]/span/text()").get()
        bulundugu_kat = response.xpath("//ul[@class='classifiedInfoList']/li[8]/span/text()").get()
        kat_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[9]/span/text()").get()
        isitma = response.xpath("//ul[@class='classifiedInfoList']/li[10]/span/text()").get()
        banyo_sayisi = response.xpath("//ul[@class='classifiedInfoList']/li[11]/span/text()").get()
        balkon = response.xpath("//ul[@class='classifiedInfoList']/li[12]/span/text()").get()
        esyali = response.xpath("//ul[@class='classifiedInfoList']/li[13]/span/text()").get()
        kullanim_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[14]/span/text()").get()
        site_icerisinde = response.xpath("//ul[@class='classifiedInfoList']/li[15]/span/text()").get()
        site_adi = response.xpath("//ul[@class='classifiedInfoList']/li[16]/span/text()").get()
        aidat = response.xpath("//ul[@class='classifiedInfoList']/li[17]/span/text()").get()
        krediye_uygun = response.xpath("//ul[@class='classifiedInfoList']/li[18]/span/text()").get()
        tapu_durumu = response.xpath("//ul[@class='classifiedInfoList']/li[19]/span/text()").get()
        kimden = response.xpath("//ul[@class='classifiedInfoList']/li[20]/span/text()").get()
        fiyat = response.xpath("//div[@class='classifiedInfo ']/h3[contains(text(), 'TL')]/text()").get()
        box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
        # Indentation reconstructed: the one-element sets printed in the
        # discussion imply set_box is rebuilt (and was printed) inside the loop.
        for word in box:
            # The result of replace() is discarded: the next line rebinds box
            # from word again.
            box = word.replace(u"\n", "")
            box = word.strip()
            # Rebinds set_box to a new one-element set on every pass instead of
            # adding to one set, so only the last entry is left after the loop.
            set_box =set({box})
            #print(set_box)
        ozellikler = set_box
        # NOTE(review): every value below comes from .get(); if an XPath matched
        # nothing the value would presumably be None and .strip() would fail - confirm.
        item = {
            'ilan_no' : ilan_no.strip(),
            'fiyat' : fiyat.strip(),
            'kimden' : kimden.strip(),
            'tapu_durumu' : tapu_durumu.strip(),
            'krediye_uygun' : krediye_uygun.strip(),
            'aidat' : aidat.strip(),
            'site_adi' : site_adi.strip(),
            'site_icerisinde' : site_icerisinde.strip(),
            'kullanim_durumu' : kullanim_durumu.strip(),
            'esyali' : esyali.strip(),
            'balkon' : balkon.strip(),
            'banyo_sayisi' : banyo_sayisi.strip(),
            'isitma' : isitma.strip(),
            'kat_sayisi' : kat_sayisi.strip(),
            'bulundugu_kat' : bulundugu_kat.strip(),
            'bina_yasi' : bina_yasi.strip(),
            'oda_sayisi' : oda_sayisi.strip(),
            'metrekare_net' : metrekare_net.strip(),
            'metrekare_brut' : metrekare_brut.strip(),
            'emlak_tipi' : emlak_tipi.strip(),
            'ilan_tarihi' : ilan_tarihi.strip(),
            # Holds just the one-element set left over from the final loop pass,
            # which is why the printed output contains a single feature.
            'ozellikler' : set_box
        }
        print(item)
Here is the output:
{'ilan_no': '1028261219', 'fiyat': '870.000 TL', 'kimden': 'Emlak Ofisinden', 'tapu_durumu': 'Kat Mülkiyetli', 'krediye_uygun': 'Evet', 'aidat': 'Belirtilmemiş', 'site_adi': 'Belirtilmemiş', 'site_icerisinde': 'Hayır', 'kullanim_durumu': 'Kiracılı', 'esyali': 'Evet', 'balkon': 'Var', 'banyo_sayisi': '1', 'isitma': 'Doğalgaz (Kombi)', 'kat_sayisi': '3', 'bulundugu_kat': 'Bahçe Katı', 'bina_yasi': '5-10 arası', 'oda_sayisi': '1+1', 'metrekare_net': '50', 'metrekare_brut': '60', 'emlak_tipi': 'Satılık Daire', 'ilan_tarihi': '28 Eylül 2022', 'ozellikler': {'Bahçeli'}}
The "ozellikler" field should contain more than 50 items, but only one of them (the last one) appears in the output. When I print set_box inside the loop, these are the actual items for "ozellikler":
{'Güney'}
{'Kuzey'}
{'Amerikan Kapı'}
{'Amerikan Mutfak'}
{'Beyaz Eşya'}
{'Buzdolabı'}
{'Çamaşır Makinesi'}
{'Çelik Kapı'}
{'Duşakabin'}
{'Isıcam'}
{'Kartonpiyer'}
{'Laminat Zemin'}
{'Mutfak (Laminat)'}
{'Mutfak Doğalgazı'}
{'PVC Doğrama'}
{'Seramik Zemin'}
{'Set Üstü Ocak'}
{'Isı Yalıtımı'}
{'Ses Yalıtımı'}
{'Uydu'}
{'Alışveriş Merkezi'}
{'Belediye'}
{'Cami'}
{'Cemevi'}
{'Eczane'}
{'Hastane'}
{'İlkokul-Ortaokul'}
{'İtfaiye'}
{'Lise'}
{'Market'}
{'Park'}
{'Polis Merkezi'}
{'Sağlık Ocağı'}
{'Semt Pazarı'}
{'Spor Salonu'}
{'Şehir Merkezi'}
{'Üniversite'}
{'Anayol'}
{'Avrasya Tüneli'}
{'Boğaz Köprüleri'}
{'Cadde'}
{'Dolmuş'}
{'E-5'}
{'Marmaray'}
{'Metro'}
{'Metrobüs'}
{'Minibüs'}
{'Otobüs Durağı'}
{'TEM'}
{'Doğa'}
{'Şehir'}
{'Bahçe Katı'}
{'Bahçeli'}
How can I get all the items?
Solution
The site doesn't seem to allow access from outside the country; however, I believe the problem is in how you are parsing the box variable.
Try replacing your set_box loop with this:
...
# Text nodes of every <li class="selected"> under the classifiedDescription container.
box = response.xpath("//div[@class='uiBoxContainer classifiedDescription']/ul/li[@class='selected']/text()").getall()
# Create the set once, before the loop, and add to it so that entries
# accumulate instead of the set being rebuilt on every pass.
set_box = set()
for word in box:
    # Remove embedded newlines and surrounding whitespace before storing.
    word = word.replace("\n", "").strip()
    set_box.add(word)
...
...
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.