Issue
I have created a scraper that grabs specific elements from a web-page. The website provides the option to go into all the artists
in the webpage, so I can directly get all the artists
from this page as there is no 'next-page' href provided by the website. My issue is that when I load all the websites into requests it crawls nothing, however when I reduce the list of webpages it will begin to crawl pages. Any ideas as to what is causing this issue?
Furthermore, I want to grab all the lyrics from the song page. However, some lyrics are spaced out between `<a>` tags, whilst others are a single string. At times I get no lyrics even though the page clearly has lyrics when I open the direct URL. How can I grab all the text regardless, and get the lyrics for all songs? If I include the following:
.//pre[@id='lyric-body-text']//a//text()
it still only grabs the first line of text under the `<a>` tag.
Here's an example of my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """Container for a single scraped song record."""

    # Each field keeps only the first value that is loaded into it.
    artists = Field(output_processor=TakeFirst())
    songs = Field(output_processor=TakeFirst())
    duration = Field(output_processor=TakeFirst())
    album = Field(output_processor=TakeFirst())
    year = Field(output_processor=TakeFirst())
    lyrics = Field(output_processor=TakeFirst())
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com A-Z artist indexes, then each artist's songs and lyrics."""

    name = 'lyrical'

    # '0' plus A-Z index pages; the /99999 suffix requests the full,
    # unpaginated artist listing for that letter.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are case-sensitive uppercase.
        # 'User_Agent' was silently ignored, so the default Scrapy UA was
        # sent and the site could refuse to serve the listing pages.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    # NOTE: the original start_requests() override duplicated the default
    # behaviour (yield a Request with callback=self.parse for each start URL),
    # so it has been removed.

    def parse(self, response):
        """Parse one A-Z index page and follow every artist link found."""
        for row in response.xpath("//table[@class='tdata']//tbody//tr"):
            loader = ItemLoader(LyricalItem(), selector=row)
            loader.add_xpath('artists', '(.//a)[position() mod 2 = 1]//text()')
            link = row.xpath("(.//a)[position() mod 2 = 1]//@href").get()
            if link:  # skip rows that carry no artist link
                yield response.follow(
                    response.urljoin(link),
                    callback=self.parse_artists,
                    cb_kwargs={'loader': loader},
                )

    def parse_artists(self, response, loader):
        """Record album/year/song metadata, then follow to the lyrics page.

        BUG FIX: ItemLoader.add_value() returns None, so the original
        `if loader.add_value(...)` condition was always falsy — the fallback
        branch always ran and the value was double-added.  Extract the value
        first and fall back explicitly when it is missing.
        """
        album = response.xpath("(//h3[@class='artist-album-label'])//a//text()").get()
        loader.add_value('album', album if album else 'Unknown Album')  # typo "Unkown" fixed
        year = response.xpath("(//h3[@class='artist-album-label'])//span//text()").get()
        loader.add_value('year', year if year else 'Unknown Year')
        loader.add_value('songs', response.xpath("(.//td)[position() mod 2=1]//text()").get())
        loader.add_value('duration', response.xpath("(.//td)[position() mod 2=0]/text()").get())

        lyrics_link = response.xpath("(.//td)[position() mod 2=1]//@href").get()
        if lyrics_link:
            # BUG FIX: yield the finished item exactly once (from the lyrics
            # callback) instead of also emitting a duplicate partial item here.
            yield response.follow(
                response.urljoin(lyrics_link),
                callback=self.get_lyrical,
                cb_kwargs={'loader': loader},
            )
        else:
            yield loader.load_item()

    def get_lyrical(self, response, loader):
        """Attach the full lyric text and emit the completed item.

        BUG FIX: .get() returned only the first text node, which is why lyrics
        split across <a> tags were truncated; getall() collects every text
        fragment and they are joined into one string.
        """
        fragments = response.xpath(".//pre[@id='lyric-body-text']//text()").getall()
        loader.add_value('lyrics', ' '.join(part.strip() for part in fragments if part.strip()))
        yield loader.load_item()
# Run the spider and export every scraped item as one JSON object per line.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        # FIX: FEED_URI / FEED_FORMAT are deprecated; FEEDS is the
        # modern, equivalent way to configure exports.
        'FEEDS': {
            'artists.jl': {'format': 'jsonlines'},
        },
    }
)
process.crawl(LyricalSpider)
process.start()
Solution
Your code has quite a lot of redundant snippets. I have removed the redundant code and also implemented your request to have all the lyrics captured. Also all the information is available on the lyrics page so there's no need to pass the loader item around. You can simply crawl all the information from the lyrics page.
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, Join
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """One song together with its metadata and full lyric text."""

    artist = Field()
    song = Field()
    duration = Field()
    album = Field()
    year = Field()
    # Lyric lines are added one at a time and glued back together
    # with single spaces on output.
    lyrics = Field(output_processor=Join(" "))
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com: A-Z index pages -> artist pages -> individual song pages.

    All item fields are scraped from the song page itself, so no loader
    needs to be passed between callbacks.
    """

    name = 'lyrical'

    # '0' plus A-Z index pages; the /99999 suffix requests the full,
    # unpaginated artist listing for that letter.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are case-sensitive uppercase;
        # 'User_Agent' was silently ignored.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    def parse(self, response):
        """Follow artist links on an index page.

        NOTE(review): the [:2] slice caps each index page at its first two
        artists — presumably a debugging limit; remove it for a full crawl.
        """
        for artist in response.xpath("//table[@class='tdata']//tr/td/strong/a")[:2]:
            yield response.follow(artist, callback=self.parse_artists)

    def parse_artists(self, response):
        """Follow every song link on an artist page."""
        for song in response.xpath("//table[@class='tdata']/tbody/tr/td/strong/a"):
            yield response.follow(song, callback=self.get_lyrical)

    def get_lyrical(self, response):
        """Scrape all metadata plus the complete lyrics from one song page."""
        loader = ItemLoader(LyricalItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_xpath("album", "//h4[contains(text(),'more tracks from the album')]/following-sibling::h3/a/text()")
        # With TakeFirst() the later add_value only wins when the xpath above
        # matched nothing.  BUG FIX: "Uknown" typos corrected.
        loader.add_value("album", "Unknown album")
        loader.add_value("artist", response.xpath("normalize-space(//h3[@class='lyric-artist'])").get())
        loader.add_value("artist", "Unknown artist")
        loader.add_xpath("song", "//h1[@id='lyric-title-text']/text()")
        loader.add_xpath("year", "//dt[contains(text(),'Year:')]/following-sibling::dd/a/text()")
        loader.add_xpath("duration", "//dt[i[@title='Duration']]/following-sibling::dd/text()")

        # descendant-or-self captures lyric text whether it sits directly in
        # the container or is split across nested <a> tags.
        for line in response.xpath("//*[@id='lyric-body-text']/descendant-or-self::*/text()").getall():
            line = line.strip()
            if line:  # BUG FIX: skip blank fragments so Join(" ") doesn't double-space
                loader.add_value('lyrics', line)
        yield loader.load_item()
# Run the spider, exporting every scraped item as one JSON object per line.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        'FEEDS': {'songs.jl': {'format': 'jsonlines'}},
    },
)
process.crawl(LyricalSpider)
process.start()
Answered By - msenior_
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.