Up until three days ago I was able to scrape the target site. However, it has started raising the error posted below. When I looked at the site's source code, I could not see any changes, and Scrapy still reports a 200 response. I am using a proxy and a custom user-agent; I changed both, but the result is the same: I keep getting a JSON decode error.
File "/usr/lib/python3.8/json/decoder.py", line 355, in raw_decode raise JSONDecodeError("Expecting value", s, err.value) from None json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
My code:
import scrapy
import json
import datetime
import bs4
import re
import time
from requests.models import PreparedRequest
import logging
from hepsibura_spider.items import HepsiburaSpiderItem
from scrapy.crawler import CrawlerProcess
# Scrapy spider that pulls product listings out of a JavaScript state object
# embedded in each listing page and follows pagination via the 'sayfa' query
# parameter.
# NOTE(review): this excerpt lost its indentation and several spans (URLs,
# call arguments, parts of expressions) during extraction; the comments below
# describe only what is still visible.
class HepsiburaSpider(scrapy.Spider):
name = 'hepsibura'
# allowed_domains = ['']
# Let 301 responses reach the callback; parse_json checks for that status.
handle_httpstatus_list = [301]
# Build one initial request per start URL, carrying the parsed query
# parameters along for the callback.
def start_requests(self):
# NOTE(review): the URL list contents and its closing bracket are missing
# from this excerpt.
urls = [
for url in urls:
params = []
# added a meta to provide the used url here
# NOTE(review): because of operator precedence the right-hand side is the
# 2-tuple `((url.split('&') if '&' in url else url), None)`, so `parameters`
# is always None and `main_url` is a list whenever the URL contains '&'.
# The intent was presumably
# `url.split('&') if '&' in url else (url, None)` - confirm.
main_url, parameters = url.split('&') if '&' in url else url, None
parameters = parameters.split(':') if parameters else []
for parameter in parameters:
key, value = parameter.split('=')
params.append((key.strip(), value.strip()))
# params.append(('main_url', main_url))
# Default to the first page when no 'sayfa' parameter was supplied
# (presumably the site's page-number parameter - confirm).
if 'sayfa' not in dict(params):
params.append(('sayfa', '1'))
# NOTE(review): the Request's url/callback/meta/headers keywords are
# missing; the entries below look like the bodies of `meta={...}` and
# `headers={...}` dicts.
yield scrapy.Request(
'main_url': main_url,
'params': dict(params),
'Cache-Control': 'store, no-cache, must-revalidate, post-check=0, pre-check=0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.5134.152 Safari/537.36',
# Callback: locate the product-list <script>, decode its JSON payload, yield
# one item per product, then request the next page.
def parse_json(self, response):
# A 301 is logged as the end of the crawl.
# NOTE(review): whatever followed the log call (presumably a `return`) is
# not visible in this excerpt.
if response.status == 301:
logging.log(logging.INFO, 'Finished scraping')
# Base URL without the query parameters that were appended after the first '&'.
current_url = response.request.url.split('&')[0].strip()
parameters = response.meta.get('params')
soup = bs4.BeautifulSoup(response.text,'lxml')
# NOTE(review): the call that collects the <script> tags from `soup` is
# truncated on the next line.
scripts ='script')
data_script = ''
# Keep the script whose text contains the literal product-list assignment.
# If the page no longer contains exactly 'window.MORIA.PRODUCTLIST = {',
# data_script stays '' and json.loads below raises
# "Expecting value: line 1 column 1 (char 0)".
for script in scripts:
# print(script.text)
if 'window.MORIA.PRODUCTLIST = {' in str(script):
print('Found the data')
data_script = str(script)
# Strip the <script> wrapper, the JavaScript assignment prefix and the
# 'STATE': key, then drop the last 4 characters so only the JSON object
# remains.
data_script = data_script.replace('<script type="text/javascript">','').replace('window.MORIA = window.MORIA || {};','').replace('window.MORIA.PRODUCTLIST = {','').replace('\'STATE\': ', '').replace('</script>','')[:-4]
json_data = json.loads(data_script)
products = json_data['data']['products']
# Each item is populated from the product's first variant (variantList[0]).
for product in products:
item = HepsiburaSpiderItem()
# NOTE(review): the first operand of this hash and the value assigned to
# item['date'] are truncated in this excerpt. Also, hash() of a str is
# randomized per interpreter process unless PYTHONHASHSEED is set, so this
# rowid is not stable across runs.
item['rowid'] = hash(str( + str(product['productId']))
item['date'] = str(
item['listing_id'] = product['variantList'][0]["listing"]["listingId"]
item['product_id'] = product['variantList'][0]["sku"].lower()
item['product_name'] = product['variantList'][0]['name']
item['price'] = float(product['variantList'][0]['listing']['priceInfo']['price'])
# NOTE(review): the base URL string literal is empty in this excerpt.
item['url'] = '' + product['variantList'][0]["url"]
item['merchantName'] = product['variantList'][0]["listing"]["merchantName"].lower()
yield item
# Advance to the next page and rebuild the URL with the updated query
# parameters via requests' PreparedRequest.
parameters['sayfa'] = int(parameters['sayfa']) + 1
req = PreparedRequest()
req.prepare_url(current_url, parameters)
# NOTE(review): the Request's url/callback/meta/headers keywords are missing
# here as well.
yield scrapy.Request(
'params': parameters,
'Cache-Control': 'store, no-cache, must-revalidate, post-check=0, pre-check=0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.5134.152 Safari/537.36',
# Script entry point: create a CrawlerProcess when the module is run directly.
# NOTE(review): the calls that register the spider and start the crawl are not
# visible in this excerpt.
if __name__ == '__main__':
process = CrawlerProcess()
I found something: the site changed its JSON format. Every request now generates a unique ID:
window.MORIA.PRODUCTLIST = Object.assign(window.MORIA.PRODUCTLIST || {}, {
'60cada8e-57dd-466e-f7af-62efca4fa8a8': {
How can I bypass this?
Thank you.
There's really no need to use BeautifulSoup with scrapy.
The problem is that `data_script` is empty.
Get rid of the loop; just use XPath to select the `script` tag that contains that text, and use the `re_first()` function to extract the JSON string.
Also, you might want to check that `data` is not empty before using it later.
# Replacement for the BeautifulSoup-based script lookup inside parse_json
# (the old loop is kept below, commented out, for comparison).
# soup = bs4.BeautifulSoup(response.text, 'lxml')
# scripts ='script')
# data_script = ''
# for script in scripts:
# # print(script.text)
# if 'window.MORIA.PRODUCTLIST = {' in str(script):
# print('Found the data')
# data_script = str(script)
# break
# Select the text of the <script> that mentions window.MORIA.PRODUCTLIST and
# capture the object literal that follows the 'STATE': key. This matches on
# the key rather than on the exact assignment prefix.
# NOTE(review): re_first() yields None when nothing matches (per the Scrapy
# selector docs - confirm), and json.loads(None) raises TypeError, so check
# `data` before parsing.
data = response.xpath('//script[contains(text(), "window.MORIA.PRODUCTLIST")]/text()').re_first(r'\'STATE\': ({.+})')
#data_script = response.xpath('//script[contains(text(), "window.MORIA.PRODUCTLIST")]/text()').re()
#data_script = data_script.replace('<script type="text/javascript">', '').replace('window.MORIA = window.MORIA || {};', '').replace('window.MORIA.PRODUCTLIST = {', '').replace('\'STATE\': ', '').replace('</script>', '')[:-4]
# Decode the captured JSON text and pull out the product list.
json_data = json.loads(data)
products = json_data['data']['products']
{'rowid': -1443611402678861624, 'date': '2022-09-18 16:25:58.168075', 'listing_id': 'fd2eb812-f483-4233-bfea-610490e16014', 'product_id': 'hbcv000013tlaw', 'product_name': 'MSI PRO\xa016T 10M-043TR Intel Celeron 5205U 4GB\xa0128GB SSD Windows 10 Pro 15.6" All In One Bilgisayar', 'price': 12491.21, 'url': '', 'merchantName': 'hepsiburada'}
DEBUG: Scraped from <200;?_random_number=1663507557.5583432>
{'rowid': 557951834722614927, 'date': '2022-09-18 16:25:58.168075', 'listing_id': '76835cb4-74d3-498b-821a-58311752c934', 'product_id': 'hbcv0000065deb', 'product_name': 'Apple iMac M1 Çip 8GB 512GB SSD macOS Retina 24" FHD All In One Bilgisayar MGPJ3TU/A Yeşil', 'price': 34998.99, 'url': '', 'merchantName': 'hepsiburada'}
DEBUG: Scraped from <200;?_random_number=1663507557.5583432>
{'rowid': 2200588215358298971, 'date': '2022-09-18 16:25:58.168075', 'listing_id': '17fd6809-afa9-4e21-a772-509864d9bf28', 'product_id': 'hbcv000014z6fy', 'product_name': 'MSI MODERN AM241 11M-298TR Intel Pentium 7505 4GB 128GB SSD Windows 10 Pro 23.8" All In One Bilgisayar', 'price': 15335.09, 'url': '', 'merchantName': 'hepsiburada'}
DEBUG: Scraped from <200;?_random_number=1663507557.5583432>
{'rowid': 2433557015268455354, 'date': '2022-09-18 16:25:58.170501', 'listing_id': '0f0e1577-f1c2-4df5-ae9c-6c754317e998', 'product_id': 'hbcv00001eo94e', 'product_name': 'MSI MODERN AM271P 11M-021XTR Intel Core i7 1165G7 16GB 512GB SSD Freedos 27" FHD All In One Bilgisayar', 'price': 26754.38, 'url': '', 'merchantName': 'hepsiburada'}
Scraped from <200;?_random_number=1663507557.5583432>
Answered By - SuperUser
Post a Comment
Note: Only a member of this blog may post a comment.