Issue
I am scraping and parsing JavaScript-rendered pages with Playwright. There are about 100 URLs, but the process ends before all of them have been completed.
What could be the cause? The code runs without errors so far.
Is the for loop in the wrong place?
I would also appreciate it if you could tell me whether I am using async incorrectly.
Below is my current code. I run it through Scrapy with: scrapy runspider kuti_info.py
import scrapy
import requests
from bs4 import BeautifulSoup
from time import sleep
from scrapy.selector import Selector
from playwright.sync_api import sync_playwright
import asyncio

class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['xxxxxxx.jp']
    start_urls = ['https://xxxxxxx.jp/']

    def parse(self, response):
        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        urls = response.xpath('//div[@class="salonName"]')
        for url in urls:
            yield response.follow(url=url.xpath('.//h3/a/@href').get(), callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    async def parse_shop(self, response):
        try:
            r = requests.get(response.url)
            soup = BeautifulSoup(r.text, 'html.parser')
            repo = soup.find('div', {'class': 'abbr uTxt'})
        except:
            pass

        urls = response.xpath('//div[@class="viewMore"]/a/@href').get()
        for url in [urls]:
            newurls = response.urljoin(url)  # href="/therapistlist.php?id=!!!!"
            yield response.follow(url=newurls, callback=self.parse_therapist)
            # yield SeleniumRequest(url=str(newurls), screenshot=True, callback=self.parse_therapist, wait_time=2)

        try:
            yield {
                'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
                'shop_url': response.xpath('//dd/a/@href').get(),
                'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
                'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
                'report': repo.text
            }
        except:
            pass

    async def parse_therapist(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            idurls = selector.xpath('//li[@therapist_id]/a/@href').get()
            # browser.close()
            yield response.follow(url=idurls, callback=self.parse_thera_page)

    async def parse_thera_page(self, response):
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page()
            print(response.url)
            page.goto(response.url)
            sleep(2)
            html = page.content()
            selector = Selector(text=html)
            print(selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

            # try:
            #     r = requests.get(response.url)
            #     soup = BeautifulSoup(r.text, 'html.parser')
            #     repo = soup.find('div', {'class': 'txt'})
            # except:
            #     pass

            yield {
                'therapist_name': selector.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
                # 'report': repo.text
            }
Solution
I see .get() in some places, so it takes only the first item from a list - i.e. it gets the first therapist out of a list of ~250 therapists. That may be why you get fewer results than expected.
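For illustration, a minimal sketch of the difference between .get() and .getall() - the markup here is hypothetical, standing in for the real therapist list:

from scrapy.selector import Selector

# Hypothetical markup standing in for a long therapist list.
html = '<ul><li therapist_id="1"><a href="/t1">A</a></li><li therapist_id="2"><a href="/t2">B</a></li></ul>'
sel = Selector(text=html)

print(sel.xpath('//li[@therapist_id]/a/@href').get())     # '/t1' - first match only
print(sel.xpath('//li[@therapist_id]/a/@href').getall())  # ['/t1', '/t2'] - every match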
I found that therapistlist.php?id=... uses JavaScript to read all the data as JSON from therapistlist.php?id=...&more (with &more at the end) and then renders the page. So I read the therapist list as JSON data without Playwright, which gives results much, much faster.
I get ~800 therapists in ~1 minute.
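A minimal sketch of that idea, assuming - as the spider below does - that the &more endpoint returns a JSON list of objects with an 'id' field; the shop id in the URL is a placeholder:

import requests

# Placeholder shop id; the real ids come from the "viewMore" links on each shop page.
url = 'https://men-esthe.jp/therapistlist.php?id=XXXX&more'
data = requests.get(url).json()  # parsed JSON list, no browser rendering needed

for item in data:
    print('/therapist.php?id=' + item['id'])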
If you write the data to CSV, you may have another problem. In a CSV all items must have the same columns - if Scrapy sees {'therapist_name': ...} with a column therapist_name that doesn't exist in the shop data, it skips it, and you may get a file with only shops and no therapists. I added the field therapist_name to the shop data, and now the CSV also saves therapists.
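One way to keep the columns consistent is a small helper that yields every item with the full set of keys - the helper itself is hypothetical, not part of the spider below:

FIELDS = ['shop_name', 'shop_url', 'area', 'report-therapi-name', 'report', 'therapist_name']

def make_item(**kwargs):
    # Start with every column empty, then fill in what this callback knows.
    item = dict.fromkeys(FIELDS, '')
    item.update(kwargs)
    return item

# Both of these have identical columns, so the CSV exporter keeps them all.
shop_item = make_item(shop_name='Example shop')
therapist_item = make_item(therapist_name='Example therapist')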
import scrapy


class KutiSpider(scrapy.Spider):
    name = 'kuti'
    allowed_domains = ['men-esthe.jp']
    start_urls = ['https://men-esthe.jp/']

    def parse(self, response):
        print('[parse] url:', response.url)

        urls = response.xpath('//ul[@class="areaList"]/a/@href')[0].get()
        print('[parse] len(urls):', len(urls), type(urls))

        yield response.follow(url=urls, callback=self.parse_area)

        # urls = response.xpath('//ul[@class="areaList"]')
        # for url in urls:
        #     yield response.follow(url=url.xpath('.//a/@href').get(), callback=self.parse_area)

    def parse_area(self, response):
        print('[parse_area] url:', response.url)

        urls = response.xpath('//div[@class="salonName"]')
        print('[parse_area] len(urls):', len(urls), type(urls))

        for url in urls:
            url = url.xpath('.//h3/a/@href').get()
            yield response.follow(url, callback=self.parse_shop)

        # next_page = response.xpath('//div[@class="pager"]//li/a[contains(text(), "次へ")]/@href').get()
        # if next_page:
        #     yield response.follow(url=next_page, callback=self.parse_area)

    def parse_shop(self, response):
        print('[parse_shop] url:', response.url)

        urls = response.xpath('//div[@class="viewMore"]/a/@href')
        print('[parse_shop] len(urls):', len(urls), type(urls))

        for url in urls.getall():
            print('[parse_shop] url:', url)
            # request the JSON version of the therapist list
            yield response.follow(url=url + '&more', callback=self.parse_therapist)

        yield {
            'shop_name': response.xpath('//span[@class="now"]/a/span/text()').get(),
            'shop_url': response.xpath('//dd/a/@href').get(),
            'area': response.xpath('//div[@class="basicInfo"]/dl/dt[contains(text(), "エリア")]/following-sibling::dd/text()').get(),
            'report-therapi-name': response.xpath('//div[@class="heading"]//span[@class="thName"]/a[1]/text()').get(),
            'report': response.css('div.abbr.uTxt::text').get(),
            'therapist_name': "",
        }

    def parse_therapist(self, response):
        print('[parse_therapist] url:', response.url)

        data = response.json()
        for item in data:
            url = '/therapist.php?id=' + item['id']
            yield response.follow(url=url, callback=self.parse_thera_page)

    def parse_thera_page(self, response):
        print('[parse_thera_page] url:', response.url)
        print('now:', response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()'))

        yield {
            'shop_name': '',
            'shop_url': '',
            'area': '',
            'report-therapi-name': '',
            'report': '',
            'therapist_name': response.xpath('//p[@class="TopicPath"]/span[@class="now"]/a/span/text()').get(),
        }
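With every item sharing the same columns, the spider can be run as before and exported straight to CSV, e.g. scrapy runspider kuti_info.py -o output.csv, and the file will contain both the shop rows and the therapist rows.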
Answered By - furas