Issue
I'm using this website to get latitude and longitude of different cities: https://www.latlong.net/.
Here is my code:
import scrapy
import json

# Load the list of cities once, at module import time.
with open('C:/Users/coppe/tutorial/cities.json') as json_file:
    cities = json.load(json_file)

class communes_spider(scrapy.Spider):
    name = "geo"
    start_urls = ['https://www.latlong.net/']

    def parse(self, response):
        # Submit the site's search form once per city.
        # NOTE(review): formid='place' does not match the form's actual id
        # (the accepted answer below uses 'frmPlace'), so from_response may
        # not bind to the intended form.
        for city in cities:
            yield scrapy.FormRequest.from_response(response, formid='place', formdata={'place': city['city']}, callback=self.get_geo)

    def get_geo(self, response):
        # Scrape the coordinates input box. The page fills that box via
        # JavaScript, so the raw HTML Scrapy sees still holds the default
        # (0,0) value — this is the root cause discussed in the question.
        yield {'coord': response.css('input::text').get()}
The code runs totally fine, but the output I get is not correct. The default output value is (0,0), and it should be something like (50.643909, 5.571560) after submitting the form. However, the crawler still gathers (0,0) as the answer. I guess the issue comes from the website, but I cannot identify it.
JSON sample:
[{"city": "Anvers, BE"},
{"city": "Gand, BE"},
{"city": "Charleroi, BE"},
{"city": "Li\u00e8ge, BE"},
{"city": "Ville de Bruxelles, BE"},
{"city": "Schaerbeek, BE"},
{"city": "Anderlecht, BE"},
{"city": "Bruges, BE"},
{"city": "Namur, BE"},
{"city": "Louvain, BE"},
{"city": "Molenbeek-Saint-Jean, BE"}]
Solution
You can try this code; it is working on my side:
# -*- coding: utf-8 -*-
import re
import json
import scrapy
class communes_spider(scrapy.Spider):
    """Look up latitude/longitude for a list of cities on latlong.net.

    Reads city names from a local JSON file, requests the home page once
    per city (to obtain the form with its CSRF token), submits the search
    form, and extracts the coordinates from the inline JavaScript of the
    result page. The visible coordinates box is only populated client-side,
    so its HTML value is always the default (0,0) — hence the regex over
    the page source instead of a CSS selector.
    """

    name = "geo"
    allowed_domains = ["www.latlong.net"]
    start_urls = ['https://www.latlong.net/']
    custom_settings = {
        # The form submission relies on a session cookie.
        'COOKIES_ENABLED': True,
    }

    # Coordinates appear in a JS call such as `sm(50.643909,5.571560...`.
    # Raw string so that `\(` and `\d` are regex escapes, not (invalid,
    # deprecated) string escapes. This regex is not perfect and can be improved.
    LAT_LONG_REGEX = r'sm\((?P<lat>-?\d+\.?\d+),(?P<long>-?\d+\.?\d+)'

    def start_requests(self):
        """Yield one home-page request per city found in the JSON file."""
        FILE_PATH = 'C:/Users/coppe/tutorial/cities.json'
        with open(FILE_PATH) as json_file:
            cities_data = json.load(json_file)
        for d in cities_data:
            yield scrapy.Request(
                url='https://www.latlong.net/',
                callback=self.gen_csrftoken,
                meta={'city': d['city']},
                dont_filter=True,  # allow requesting the same URL multiple times
            )

    def gen_csrftoken(self, response):
        """Submit the search form (with its hidden fields) for one city."""
        city = response.meta['city']
        yield scrapy.FormRequest.from_response(
            response,
            formid='frmPlace',  # the search form's actual id attribute
            formdata={'place': city},
            callback=self.get_geo,
            meta={'city': city},
        )

    def get_geo(self, response):
        """Extract (lat, long) from the JS embedded in the result page."""
        # response.text decodes the body using the response's declared
        # encoding, which is safer than hard-coding utf-8.
        lat_long_search = re.search(self.LAT_LONG_REGEX, response.text)
        if lat_long_search:
            yield {
                'coord': (lat_long_search.group('lat'), lat_long_search.group('long')),
                'city': response.meta['city'],
            }
        else:
            # Something is wrong, you can investigate with `inspect_response`
            from scrapy.shell import inspect_response
            inspect_response(response, self)
The reason you find (0, 0) is that the lat/long coordinates are rendered through JavaScript (they are populated from the backend inside the template). Scrapy cannot execute JavaScript without Splash.
So basically, what we are doing is parsing the JS script with a regex in order to find the lat/long values.
Answered By - Sewake
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.