Issue
I'm trying to get data from https://www.ouedkniss.com/boutiques/immobilier . I found that ouedkniss.com is using GraphQL API. I tried to use this API but failed to pull data and also to paginate. An error is showing. AttributeError: 'list' object has no attribute 'get'
I don't know if I miss something else here or not. Here is what I tried so far:
import scrapy
import json
from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = json.dumps([
{
"operationName": "SearchStore",
"query": "query Campaign($slug: String!) {\n project(slug: $slug) {\n id\n isSharingProjectBudget\n risks\n story(assetWidth: 680)\n currency\n spreadsheet {\n displayMode\n public\n url\n data {\n name\n value\n phase\n rowNum\n __typename\n }\n dataLastUpdatedAt\n __typename\n }\n environmentalCommitments {\n id\n commitmentCategory\n description\n __typename\n }\n __typename\n }\n}\n",
"variables": {
"q": "", "filter": {
"categorySlug": "immobilier",
"count": 12, "page": 1},
"categorySlug": "immobilier",
"count": 12,
"page": 1
},
}
])
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=payload,
callback=self.parse
)
return super().start_requests()
def parse(self, response):
json_resp = json.loads(response.body)
# print(json_resp)
stores = json_resp.get('data')[0].get('stores').get('data')
for store in stores:
loader = ItemLoader(item=OuedknissItem())
loader.add_value('name', store.get('name'))
yield loader.load_item()
Solution
Your payload json data wasn't well-formatted thats why output was validator errors.Now it's working fine.
import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = json.dumps({
"operationName":"SearchStore",
"variables":{
"q":"",
"filter":{
"categorySlug":"immobilier",
"count":12,
"page":1
}
},
"query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {\n stores: storeSearch(q: $q, filter: $filter) {\n data {\n id\n name\n slug\n description\n imageUrl\n followerCount\n announcementsCount\n url\n mainLocation {\n location {\n region {\n name\n __typename\n }\n city {\n name\n __typename\n }\n __typename\n }\n __typename\n }\n announcements(count: 6, page: 1) {\n data {\n id\n defaultMedia(size: SMALL) {\n mediaUrl\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n paginatorInfo {\n lastPage\n __typename\n }\n __typename\n }\n}\n"
})
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=payload,
callback=self.parse
)
return super().start_requests()
def parse(self, response):
json_resp = json.loads(response.body)
#print(json_resp)
stores = json_resp.get('data').get('stores').get('data')[0]
print(stores)
# loader = ItemLoader(item=OuedknissItem())
# yield loader.load_item()
Output:
{'id': '7088', 'name': 'Rachid Dounia', 'slug': 'rachid-dounia', 'description': 'agence immobiliere', 'imageUrl': 'https://cdn.ouedkniss.com/stores/7088/Logo.jpg', 'followerCount': 4, 'announcementsCount': 11, 'url': '', 'mainLocation': {'location': {'region': {'name': 'Algiers', '__typename': 'Region'}, 'city': {'name': 'Cheraga', '__typename': 'City'}, '__typename': 'Location'}, '__typename': 'StoreLocation'}, 'announcements': {'data': [{'id': '34036104', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33491623', 'defaultMedia': {'mediaUrl': 'https://cdn9.ouedkniss.com/200/medias/announcements/images/pA6vV/4llx7bXtpjVv8196UOgs3ebpXai5HAYl7rs51MAD.jpg', '__typename': 'AnnouncementMedia'}, '__typename': 'Announcement'}, {'id': '33491551', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '27271413', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '33794330', 'defaultMedia': None, '__typename': 'Announcement'}, {'id': '32853052', 'defaultMedia': None, '__typename': 'Announcement'}], '__typename': 'AnnouncementPagination'}, '__typename': 'Store'}
2022-12-13 00:09:28 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-13 00:09:28 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1319,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 3260,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
Update along with payload pagination:
import scrapy
import json
#from ..items import OuedknissItem
from scrapy.loader import ItemLoader
class StoresSpider(scrapy.Spider):
name = 'stores'
allowed_domains = ['www.ouedkniss.com']
def start_requests(self):
payload = {
"operationName":"SearchStore",
"variables":{
"q":"",
"filter":{
"categorySlug":"immobilier",
"count":12,
"page": 1
}},
"query":"query SearchStore($q: String, $filter: StoreSearchFilterInput!) {\n stores: storeSearch(q: $q, filter: $filter) {\n data {\n id\n name\n slug\n description\n imageUrl\n followerCount\n announcementsCount\n url\n mainLocation {\n location {\n region {\n name\n __typename\n }\n city {\n name\n __typename\n }\n __typename\n }\n __typename\n }\n announcements(count: 6, page: 1) {\n data {\n id\n defaultMedia(size: SMALL) {\n mediaUrl\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n paginatorInfo {\n lastPage\n __typename\n }\n __typename\n }\n}\n"
}
headers= {
"Content-Type": "application/json",
# "X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
for payload['variables']['filter']['page'] in range(1,3):
yield scrapy.Request(
url='https://api.ouedkniss.com/graphql',
method="POST",
headers=headers,
body=json.dumps(payload),
callback=self.parse
)
def parse(self, response):
json_resp = json.loads(response.body)
#print(json_resp)
stores = json_resp['data']['stores']['data']
for store in stores:
yield {
'id':store['id']
}
Answered By - Md. Fazlul Hoque
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.