Issue
I have created a scraper that grabs specific elements from a web-page. The website provides the option to go into all the artists
in the webpage, so I can directly get all the artists
from this page as there is no 'next-page' href provided by the website. My issue is that when I load all the websites into requests it crawls nothing, however when I reduce the list of webpages it will begin to crawl pages. Any ideas as to what is causing this issue?
Furthermore, I want to grab all the lyrics from the song page. However, some lyrics are spaced out between `<a>` tags, whilst others are a single string. At times I get no lyrics even though the page clearly has lyrics when I open the direct URL. How can I grab all the text regardless, and get the lyrics for all songs? If I include the following:
.//pre[@id='lyric-body-text']//a//text()
it still only grabs the first line of text under the `<a>` tag.
Here's an example of my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """Container for a single scraped song record."""

    # Each field keeps only the first value that is loaded into it.
    artists = Field(output_processor=TakeFirst())
    songs = Field(output_processor=TakeFirst())
    duration = Field(output_processor=TakeFirst())
    album = Field(output_processor=TakeFirst())
    year = Field(output_processor=TakeFirst())
    lyrics = Field(output_processor=TakeFirst())
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com A-Z artist indexes, then each artist's songs and lyrics."""

    name = 'lyrical'

    # '0' plus A-Z index pages; the /99999 suffix requests the full,
    # unpaginated artist listing for that letter.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are case-sensitive uppercase.
        # 'User_Agent' was silently ignored, so the default Scrapy UA was
        # sent and the site could refuse to serve the listing pages.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    # NOTE: the original start_requests() override duplicated the default
    # behaviour (yield a Request with callback=self.parse for each start URL),
    # so it has been removed.

    def parse(self, response):
        """Parse one A-Z index page and follow every artist link found."""
        for row in response.xpath("//table[@class='tdata']//tbody//tr"):
            loader = ItemLoader(LyricalItem(), selector=row)
            loader.add_xpath('artists', '(.//a)[position() mod 2 = 1]//text()')
            link = row.xpath("(.//a)[position() mod 2 = 1]//@href").get()
            if link:  # skip rows that carry no artist link
                yield response.follow(
                    response.urljoin(link),
                    callback=self.parse_artists,
                    cb_kwargs={'loader': loader},
                )

    def parse_artists(self, response, loader):
        """Record album/year/song metadata, then follow to the lyrics page.

        BUG FIX: ItemLoader.add_value() returns None, so the original
        `if loader.add_value(...)` condition was always falsy — the fallback
        branch always ran and the value was double-added.  Extract the value
        first and fall back explicitly when it is missing.
        """
        album = response.xpath("(//h3[@class='artist-album-label'])//a//text()").get()
        loader.add_value('album', album if album else 'Unknown Album')  # typo "Unkown" fixed
        year = response.xpath("(//h3[@class='artist-album-label'])//span//text()").get()
        loader.add_value('year', year if year else 'Unknown Year')
        loader.add_value('songs', response.xpath("(.//td)[position() mod 2=1]//text()").get())
        loader.add_value('duration', response.xpath("(.//td)[position() mod 2=0]/text()").get())

        lyrics_link = response.xpath("(.//td)[position() mod 2=1]//@href").get()
        if lyrics_link:
            # BUG FIX: yield the finished item exactly once (from the lyrics
            # callback) instead of also emitting a duplicate partial item here.
            yield response.follow(
                response.urljoin(lyrics_link),
                callback=self.get_lyrical,
                cb_kwargs={'loader': loader},
            )
        else:
            yield loader.load_item()

    def get_lyrical(self, response, loader):
        """Attach the full lyric text and emit the completed item.

        BUG FIX: .get() returned only the first text node, which is why lyrics
        split across <a> tags were truncated; getall() collects every text
        fragment and they are joined into one string.
        """
        fragments = response.xpath(".//pre[@id='lyric-body-text']//text()").getall()
        loader.add_value('lyrics', ' '.join(part.strip() for part in fragments if part.strip()))
        yield loader.load_item()
# Run the spider and export every scraped item as one JSON object per line.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        # FIX: FEED_URI / FEED_FORMAT are deprecated; FEEDS is the
        # modern, equivalent way to configure exports.
        'FEEDS': {
            'artists.jl': {'format': 'jsonlines'},
        },
    }
)
process.crawl(LyricalSpider)
process.start()
Solution
Your code has quite a lot of redundant snippets. I have removed the redundant code and also implemented your request to have all the lyrics captured. Also all the information is available on the lyrics page so there's no need to pass the loader item around. You can simply crawl all the information from the lyrics page.
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, Join
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """One song together with its metadata and full lyric text."""

    artist = Field()
    song = Field()
    duration = Field()
    album = Field()
    year = Field()
    # Lyric lines are added one at a time and glued back together
    # with single spaces on output.
    lyrics = Field(output_processor=Join(" "))
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com: A-Z index pages -> artist pages -> individual song pages.

    All item fields are scraped from the song page itself, so no loader
    needs to be passed between callbacks.
    """

    name = 'lyrical'

    # '0' plus A-Z index pages; the /99999 suffix requests the full,
    # unpaginated artist listing for that letter.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are case-sensitive uppercase;
        # 'User_Agent' was silently ignored.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    def parse(self, response):
        """Follow artist links on an index page.

        NOTE(review): the [:2] slice caps each index page at its first two
        artists — presumably a debugging limit; remove it for a full crawl.
        """
        for artist in response.xpath("//table[@class='tdata']//tr/td/strong/a")[:2]:
            yield response.follow(artist, callback=self.parse_artists)

    def parse_artists(self, response):
        """Follow every song link on an artist page."""
        for song in response.xpath("//table[@class='tdata']/tbody/tr/td/strong/a"):
            yield response.follow(song, callback=self.get_lyrical)

    def get_lyrical(self, response):
        """Scrape all metadata plus the complete lyrics from one song page."""
        loader = ItemLoader(LyricalItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("album", "//h4[contains(text(),'more tracks from the album')]/following-sibling::h3/a/text()")
        # With TakeFirst() the later add_value only wins when the xpath above
        # matched nothing.  BUG FIX: "Uknown" typos corrected.
        loader.add_value("album", "Unknown album")
        loader.add_value("artist", response.xpath("normalize-space(//h3[@class='lyric-artist'])").get())
        loader.add_value("artist", "Unknown artist")
        loader.add_xpath("song", "//h1[@id='lyric-title-text']/text()")
        loader.add_xpath("year", "//dt[contains(text(),'Year:')]/following-sibling::dd/a/text()")
        loader.add_xpath("duration", "//dt[i[@title='Duration']]/following-sibling::dd/text()")

        # descendant-or-self captures lyric text whether it sits directly in
        # the container or is split across nested <a> tags.
        for line in response.xpath("//*[@id='lyric-body-text']/descendant-or-self::*/text()").getall():
            line = line.strip()
            if line:  # BUG FIX: skip blank fragments so Join(" ") doesn't double-space
                loader.add_value('lyrics', line)
        yield loader.load_item()
# Run the spider, exporting every scraped item as one JSON object per line.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        'FEEDS': {'songs.jl': {'format': 'jsonlines'}},
    },
)
process.crawl(LyricalSpider)
process.start()
Answered By - msenior_
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.