Issue
I'm creating a script that lists all the businesses from one website; it needs to scrape each one's name, address, website, email and telephone number. I've got to the point where I can more or less scrape the email, but I have a small problem: I can't just tell my script to take every address. Only specific ones are wanted — they need to contain "Biuro", "Sekretariat", or the name part of the company's website (www.(namePart).com) — and I don't know how to do that. Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
    """Crawl the rynekpierwotny.pl developer listing and collect contact data.

    For every developer on a listing page the spider follows the developer's
    profile link to read the company website, then visits that website to
    look for an e-mail link. Each yielded item is a dict with the keys
    ``address``, ``name``, ``href``, ``website`` and ``email``; a value is
    ``None`` when the corresponding element was not found.
    """

    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        """Handle one listing page: follow each developer, then the next page."""
        # Guard the lookup: indexing an empty selector list raises IndexError.
        roots = response.css('div#root')
        listings = roots[0].css('li.rp-np9kb1') if roots else []
        next_page = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
        for listing in listings:
            page = listing.css('a::attr(href)').get()
            params = {
                'address': listing.css('address.rp-o9b83y::text').get(),
                'name': listing.css('h2.rp-69f2r4::text').get(),
                'href': page,
            }
            if not page:
                # No profile link: urljoin(None) would just return the current
                # URL, so emit what is known instead of re-requesting it.
                params['website'] = None
                params['email'] = None
                yield params
                continue
            yield Request(url=response.urljoin(page), cb_kwargs={'params': params}, callback=self.parseMain)
        # On the last page there is no following link; skip the request
        # rather than asking for the current page again.
        if next_page:
            yield Request(url=response.urljoin(next_page), callback=self.parse)

    def parseMain(self, response, params=None):
        """Read the company website from a developer profile page.

        ``params`` is the partially filled item passed through ``cb_kwargs``.
        """
        website = response.css('div.rp-l0pkv6 a::attr(href)').get()
        params['website'] = website
        if not website:
            # Without a website there is nothing to visit. urljoin(None)
            # returns this same profile URL, so the follow-up request would
            # target an already-visited page (typically discarded as a
            # duplicate) and the item would never be emitted.
            params['email'] = None
            yield params
            return
        yield Request(url=response.urljoin(website), cb_kwargs={'params': params}, callback=self.parseEmail)

    def parseEmail(self, response, params=None):
        """Store the first link whose href contains "@" and emit the item."""
        # The stored value is the raw href attribute (``None`` if no link matched).
        params['email'] = response.xpath('//a[contains(@href, "@")]/@href').get()
        yield params
# Run the spider directly when this file is executed as a script.
if __name__ == "__main__":
    crawler_process = CrawlerProcess()
    crawler_process.crawl(RynekMainSpider)
    crawler_process.start()
Thanks for help in advance!
Solution
In your parseEmail method, after extracting the email address, just check the extracted string like you would with any other string.
For example:
from urllib.parse import urlsplit
def parseEmail(self, response, params=None):
    """Pick the first "@" link that looks like an office address and emit the item.

    A link qualifies when its href contains "biuro", "sekretariat", or one of
    the name parts of the visited host (the dot-separated labels of the
    netloc, excluding the last label and "www"). Matching is case-insensitive.
    If no link qualifies, ``params`` is yielded without an ``email`` key.

    Args:
        response: the downloaded company page; its ``url`` and ``xpath`` are used.
        params: the partially filled item dict passed through ``cb_kwargs``.

    Yields:
        The ``params`` dict, with ``email`` set to the matching href if any.
    """
    # e.g. "www.acme.com" -> ["acme"]; empty labels are dropped because the
    # empty string is a substring of everything.
    host_labels = [
        label
        for label in urlsplit(response.url).netloc.lower().split('.')[:-1]
        if label and label != 'www'
    ]
    keywords = ['biuro', 'sekretariat', *host_labels]
    # Examine every candidate link, not only the first one: the first "@"
    # href on a page is not necessarily the wanted address. getall() returns
    # an empty list when nothing matches, so there is no None to trip over.
    for href in response.xpath('//a[contains(@href, "@")]/@href').getall():
        lowered = href.lower()
        if any(keyword in lowered for keyword in keywords):
            # The stored value is the raw href attribute, unchanged.
            params['email'] = href
            break
    yield params
Answered By - alexpdev
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.