Issue
I was scraping a site with Scrapy and Python. The code is producing seven unexpected errors. This is my code:
from scrapy import Spider
from scrapy.http import Request
import re
import pymysql
import sys
class EventSpider(Spider):
name = 'event' #name of the spider
allowed_domains = ['....com']
start_urls = ['http://....com/...',
'http://....com/....',
'http://....com/.....',
'http://.....com/.....',
'http://www.....com/....',
'http://www.....com/....',
'http://www....com/.....',
'http://www.....com/....',
'http://www......com/....',
'http://www......com/....',
'http://www......com/....',
'http://www......com/...',
'http://www......com/....',
'http://www......com/....',
'http://www......com/...',
'http://www.....com/.....',
'http://www......com/.....']
def parse(self, response):
events = response.xpath('//h2/a/@href').extract()
#events = response.xpath('//a[@class = "event-overly"]').extract()
for event in events:
absolute_url = response.urljoin(event)
yield Request(absolute_url, callback = self.parse_event)
def parse_event(self, response):
title = response.xpath('//title/text()').extract()
''' date = response.xpath('//div/p/text()')[0]. extract()
start_date = re.search("^[0-9]{1,2}\s[A-Z][a-z]{2}(,)\s[0-9]{4}",date)
if(start_date==None):
start_date2 =''
else:
start_date2 = start_date.group(0)
#end_date = response.xpath('//div/p/text()')[0]. extract()
end_date = re.search("\s[0-9]{1,2}\s[A-Z][a-z]{2}(,)\s[0-9]{4}", date)
if(end_date==None):
end_date2=''
else:
end_date2=end_date.group(0)'''
#email = response.xpath('//*[@id="more-email-with-dots"]/@value').extract_first()
#email_final = re.findall("[a-zA-Z0-9_.+-]+@(?!....)[\.[a-zA-Z0-9-.]+",email)
description = response.xpath('//*[@class = "events-discription-block"]//*/text()').extract()
description1 = [w.replace('\r\n', '') for w in description]
description2 = ",".join(description1)
''' time = response.xpath('//div/p/text()')[1]. extract()
end_time = re.search("\s[0-9]{1,2}(:)[0-9]{1,2}(:)[0-9]{1,2}", time)
if(end_time==None):
end_time2=''
else:
end_time2=end_time.group(0)
start_time = re.search("^[0-9]{1,2}(:)[0-9]{1,2}(:)[0-9]{1,2}", time)
if(start_time==None):
start_time2=''
else:
start_time2=start_time.group(0) '''
venue = response.xpath('//*[@id ="more-text-with-dots"]/@value').extract_first()
pin = re.search("\s[0-9]{6}", venue)
if(pin==None):
pin2 = ''
else:
pin2 = pin.group(0)
connection = pymysql.connect (host = "localhost", user = "root", passwd = "Iam90#honest", db = "city_details")
cursor = connection.cursor ()
cursor.execute ("select city, state, country from cities_list")
data = cursor.fetchall ()
for row in data :
found = re.search(row[0], venue)
if(found!=None):
city = row[0]
state = row[1]
country = row[2]
break
else:
city = ''
state = ''
country = ''
creative = response.xpath('//img/@src')[1].extract()
yield{
'title': title,
#'start_date': start_date2,
#'end_date': end_date2,
#'start_time': start_time2,
#'end_time': end_time2,
'venue': venue,
'city': city,
'state': state,
'country': country,
'pin': pin2,
#'email': email_final,
'description': description2,
'creative': creative
}
and this is the stats:
2018-03-23 19:18:30 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 45819,
'downloader/request_count': 109,
'downloader/request_method_count/GET': 109,
'downloader/response_bytes': 1024848,
'downloader/response_count': 109,
'downloader/response_status_count/200': 90,
'downloader/response_status_count/301': 19,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 3, 23, 10, 18, 30, 182504),
'item_scraped_count': 64,
'log_count/DEBUG': 174,
'log_count/ERROR': 7,
'log_count/INFO': 8,
'memusage/max': 54501376,
'memusage/startup': 54501376,
'request_depth_max': 1,
'response_received_count': 90,
'scheduler/dequeued': 105,
'scheduler/dequeued/memory': 105,
'scheduler/enqueued': 105,
'scheduler/enqueued/memory': 105,
'spider_exceptions/TypeError': 7,
'start_time': datetime.datetime(2018, 3, 23, 10, 18, 13, 744056)}
2018-03-23 19:18:30 [scrapy.core.engine] INFO: Spider closed (finished)
More specially the error shown is: TypeError: expected string or bytes-like object. I couldn't figure out the error. I can't understand why and where it is producing the expected string or bytes -like object error.
Solution
As @FrankMartin said, the value of venue
is None
in that page and that is causing the error. You can check this easly following the next steps:
- Localice the URL that caused the error on the first line of the traceback:
https://www.eventsnow.com/events/9238-ipl-2018-srh-vs-royal-challengers-bangalore
- Open a scrapy shell:
scrapy shell https://www.eventsnow.com/events/9238-ipl-2018-srh-vs-royal-challengers-bangalore
- Check if the given xpath is able to found a value for this webpage
- If the result is empty that means it couldn't find it, maybe because you need a better xpath or because that webpage doesn't contain that information.
Code for point 3.
In [2]: response.xpath('//*[@id ="more-text-with-dots"]/@value').extract_first()
In [3]:
Answered By - VMRuiz
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.