Friday, November 24, 2023

[FIXED] Scrapy: cannot reach second callback function when sending request with query parameter strings

November 24, 2023 python, scrapy, web-scraping No comments

Issue

I am scraping engineering blog posts from Meta. Right now I am just trying to print the title and URL of each post. Thanks for any help.

Here is what I have done so far. The spider never reaches the parse_loadmore function and doesn't print anything. I have tried copying and pasting loadmore_endpoint into the browser and it works fine, returning the HTML code I expect.

import scrapy
from urllib.parse import urlencode
import pdfkit
import requests
import re
import json
from bs4 import BeautifulSoup
# from ..helpers import generate_pdfs_file_path

# Option flags; not referenced anywhere else in the visible code.
# NOTE(review): the keys look like wkhtmltopdf switches meant for pdfkit
# (imported above) -- confirm before relying on them.
options = {
    # 'no-images': None,
    "disable-javascript": None,
    "disable-external-links": None,
    "quiet": None,
    "encoding": "UTF-8",
}


class MetaSpider(scrapy.Spider):
    """Spider from the question: print title and URL of Meta engineering posts.

    Flow: every start URL is fetched and handed to ``parse_initial``, which
    builds four "load more" requests whose responses are meant to be handled
    by ``parse_loadmore``.

    NOTE: this is the code exactly as posted in the question. The asker
    reports that ``parse_loadmore`` is never reached and nothing is printed.
    """
    name = "meta_spider"
    # Not referenced by the methods below; the endpoint actually requested is
    # the one returned by get_loadmore_endpoints_and_params().
    api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
    start_urls = [
        "https://engineering.fb.com/category/core-infra/",
        # "https://engineering.fb.com/category/data-infrastructure/",
        # "https://engineering.fb.com/category/developer-tools/",
        # "https://engineering.fb.com/category/production-engineering/",
        # "https://engineering.fb.com/category/security/",
    ]
    # Not read or updated anywhere in the visible code.
    post_fetched = 0

    def start_requests(self):
        """Yield one request per start URL, routed to ``parse_initial``."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_initial)
    def parse_initial(self, response):
        """Yield "load more" requests for pages 0-3 of the category page.

        The endpoint and query arguments come from
        ``get_loadmore_endpoints_and_params``; that helper returns ``None``
        when it finds nothing, in which case the unpacking below fails.
        """
        endpoint, query_args = get_loadmore_endpoints_and_params(response)
        for page in range(4):
            params = {
                "action": "loadmore",
                # query_args is JSON-encoded (again) before being URL-encoded.
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            # print(f"Sending Request {loadmore_endpoint}")
            yield scrapy.Request(url=loadmore_endpoint,  callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        """Print a sanitized title and the URL for each post in the response."""
        print("parse_loadmore called with response: {}".format(response.text))
        # NOTE: no TextResponse is actually created here; the CSS selectors
        # run directly on the response object.
        for post in response.css("article.post"):
            header = post.css("header.entry-header")
            # .get() returns None when nothing matches, so .strip() would fail
            # for a post without a title link.
            title = header.css(".entry-title a::text").get().strip()
            url = header.css(".entry-title a::attr(href)").get()

            # Sanitize the title to create a valid filename
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")
  


def clean_post_html(soup):
    """Remove scripts, share widgets and the featured image from *soup* in place.

    Decomposes, in order: every ``script`` tag, every ``noscript`` tag, every
    element with class ``sharedaddy``, and finally the element with id
    ``post-feat-image-container`` when one is found.

    Args:
        soup: An object exposing BeautifulSoup-style ``find_all`` and ``find``.

    Returns:
        None. The tree is modified in place.
    """
    for tag_name in ("script", "noscript"):
        for node in soup.find_all(tag_name):
            node.decompose()

    for share_widget in soup.find_all(class_="sharedaddy"):
        share_widget.decompose()

    featured_image = soup.find(id="post-feat-image-container")
    if featured_image:
        featured_image.decompose()


def get_loadmore_endpoints_and_params(response):
    """Pull the load-more endpoint and query arguments out of an inline script.

    Looks for the first ``<script>`` whose text contains ``loadmore_params``,
    captures what follows ``var loadmore_params = `` up to the next ``;`` and
    parses it as JSON.

    Args:
        response: An object whose ``xpath(...).get()`` returns the script text
            (or ``None``).

    Returns:
        A ``(restfulURL, posts)`` tuple taken from the parsed object, or
        ``None`` when the script or the assignment is not found.
    """
    script_text = response.xpath(
        '//script[contains(., "loadmore_params")]/text()'
    ).get()
    if not script_text:
        return None

    found = re.search(r"var loadmore_params = (.*?);", script_text)
    if not found:
        return None

    loadmore_params = json.loads(found.group(1))
    return loadmore_params["restfulURL"], loadmore_params["posts"]


def get_load_more_posts_url(url, params):
    """Return *url* with *params* URL-encoded and appended after a ``?``.

    Sequence values are expanded into repeated keys (``doseq=True``).
    """
    encoded = urlencode(params, doseq=True)
    return "?".join((url, encoded))

Solution

There are 2 things needed to achieve your goal.

In settings.py or in your spider's custom_settings attribute, set "URLLENGTH_LIMIT" to a value higher than the default. The load-more endpoint is a very long URL that exceeds the limit Scrapy imposes by default, so Scrapy filters those requests out and the callback is never called.
The response passed to your parse_loadmore method is JSON, so you can't run CSS selectors on it directly. The solution is to first call response.json() to get the HTML text, then wrap that text in a scrapy.Selector and use it to run CSS and XPath queries on the HTML inside the string.

For example:

import scrapy
from urllib.parse import urlencode

import re
import json
# from ..helpers import generate_pdfs_file_path

# Option flags; not referenced anywhere else in the visible code.
# NOTE(review): presumably wkhtmltopdf switches for pdfkit, which this
# snippet does not import -- TODO confirm whether they are still needed.
options = {
    # 'no-images': None,
    "disable-javascript": None,
    "disable-external-links": None,
    "quiet": None,
    "encoding": "UTF-8",
}


class MetaSpider(scrapy.Spider):
    """Print the title and URL of posts from Meta engineering category pages.

    Flow: each start URL is fetched and handed to ``parse_initial``, which
    reads the site's "load more" endpoint and query arguments from the page
    and requests ``max_pages`` pages from it. Each of those responses is a
    JSON-encoded HTML string that ``parse_loadmore`` wraps in a selector.
    """

    name = "meta_spider"
    # Not used by the methods below; the endpoint actually requested is the
    # one returned by get_loadmore_endpoints_and_params().
    api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
    start_urls = [
        "https://engineering.fb.com/category/core-infra/",
        # "https://engineering.fb.com/category/data-infrastructure/",
        # "https://engineering.fb.com/category/developer-tools/",
        # "https://engineering.fb.com/category/production-engineering/",
        # "https://engineering.fb.com/category/security/",
    ]
    post_fetched = 0
    # Number of "load more" pages (0 .. max_pages - 1) requested per start URL.
    max_pages = 4
    custom_settings = {
        # The load-more URL carries the whole query as URL-encoded JSON and is
        # longer than Scrapy's default limit; without this the requests are
        # filtered out and parse_loadmore is never called.
        "URLLENGTH_LIMIT": 20000,
    }

    def start_requests(self):
        """Yield one request per start URL, routed to ``parse_initial``."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_initial)

    def parse_initial(self, response):
        """Yield the "load more" requests for a category page.

        Logs a warning and yields nothing when the page does not expose the
        ``loadmore_params`` script (the helper returns ``None`` in that case).
        """
        extracted = get_loadmore_endpoints_and_params(response)
        if extracted is None:
            self.logger.warning("loadmore_params not found on %s", response.url)
            return

        endpoint, query_args = extracted
        for page in range(self.max_pages):
            params = {
                "action": "loadmore",
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            yield scrapy.Request(url=loadmore_endpoint, callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        """Print a sanitized title and the URL for each post in the response.

        The response body is JSON whose decoded value is an HTML string, so it
        has to be decoded and wrapped in a ``Selector`` before CSS queries work.
        """
        html = response.json()
        if not isinstance(html, str):
            # Selector(text=...) only accepts text; anything else (e.g. an
            # error object) cannot contain posts.
            self.logger.warning("Unexpected load-more payload from %s", response.url)
            return

        resp = scrapy.Selector(text=html)
        for post in resp.css("article.post"):
            header = post.css("header.entry-header")
            # default="" avoids calling .strip() on None when the title link
            # is missing.
            title = header.css(".entry-title a::text").get(default="").strip()
            url = header.css(".entry-title a::attr(href)").get()
            if not title:
                continue

            # Sanitize the title to create a valid filename
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")



def clean_post_html(soup):
    """Strip non-content elements from a parsed post, modifying *soup* in place.

    Removal order: ``script`` tags, ``noscript`` tags, elements with class
    ``sharedaddy``, then the element with id ``post-feat-image-container``
    if it exists.

    Args:
        soup: An object exposing BeautifulSoup-style ``find_all`` and ``find``.
    """
    for unwanted in ("script", "noscript"):
        for tag in soup.find_all(unwanted):
            tag.decompose()

    for widget in soup.find_all(class_="sharedaddy"):
        widget.decompose()

    hero_image = soup.find(id="post-feat-image-container")
    if not hero_image:
        return
    hero_image.decompose()


def get_loadmore_endpoints_and_params(response):
    """Extract the load-more endpoint and query arguments from an inline script.

    Finds the first ``<script>`` whose text contains ``loadmore_params`` and
    decodes the JSON value assigned to ``var loadmore_params``.

    Args:
        response: An object whose ``xpath(...).get()`` returns the script text
            (or ``None``).

    Returns:
        A ``(restfulURL, posts)`` tuple taken from the decoded object, or
        ``None`` when the script or the assignment is not found.

    Raises:
        json.JSONDecodeError: If the assigned value is not valid JSON.
        KeyError: If the decoded object lacks ``restfulURL`` or ``posts``.
    """
    # Extracting the script content
    script_content = response.xpath(
        '//script[contains(., "loadmore_params")]/text()'
    ).get()
    if not script_content:
        return None

    assignment = re.search(r"var\s+loadmore_params\s*=\s*", script_content)
    if assignment is None:
        return None

    # Decode the JSON value starting right after the "=" instead of cutting
    # the text at the first ";": a semicolon inside a string value, or a value
    # spanning several lines, would otherwise truncate the JSON.
    params, _ = json.JSONDecoder().raw_decode(script_content, assignment.end())
    return params["restfulURL"], params["posts"]


def get_load_more_posts_url(url, params):
    """Return *url* with *params* appended as a URL-encoded query string.

    Args:
        url: Endpoint URL; it may already contain a query string (for example
            a ``?rest_route=...`` style REST URL).
        params: Mapping of query parameters. Sequence values are expanded into
            repeated keys (``doseq=True``).

    Returns:
        The combined URL. ``?`` is used when *url* has no query string yet,
        ``&`` when it already has one.
    """
    query_string = urlencode(params, doseq=True)
    if "?" not in url:
        separator = "?"
    elif url.endswith(("?", "&")):
        # Already positioned for another parameter; adding a separator would
        # create an empty one.
        separator = ""
    else:
        separator = "&"
    return f"{url}{separator}{query_string}"

PARTIAL OUTPUT

2023-11-23 19:42:48 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/category/core-infra/> (referer: None)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22%5C%22%2C%5C%22m%5C%22%3A%5C%22%5C%22%2C%5C%22p%5C%22%3A0%2C%5C%22post_parent%5C%22%3A%5C%22%5C%22%2C%5C%22subpost%5C%22%3A%5C%22%5C%22%2C%5C%22subpost_id%5C%22%3A%5C%22%5C%22%2C%5C%22attachment%5C%22%3A%5C....5C%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=3&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22%5C%22%2C%5C%22m%5C%22%3A%5C%22%5C%22%2C%5C%22p%5C%22%3A0%2C%5C%22post_parent%5C%22%3A%5C%22%5C%22%2C%5C%22subpost%5C%22%3A%5C%22%5C%22%2C%5C%22subpost_id%5C%22%3A%5C%22%5C%22%2C%5C%22attachment%5C%22%3A%5C....%22&page=0&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22...e%5C%22%3A%5C%2250%5C%22%2C%5C%22no_found_rows%5C%22%3Afalse%2C%5C
%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=1&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22%5C...2%3Atrue%2C%5C%22post_type%5C%22%3A%5C%22%5C%22%2C%5C%22posts_per_page%5C%22%3A12%2C%5C%22nopaging%5C%22%3Afalse%2C%5C%22comments_per_page%5C%22%3A%5C%2250%5C%22%2C%5C%22no_found_rows%5C%22%3Afalse%2C%5C
%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=2&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
----title: Introducing_Velox_An_open_source_unified_execution_engine, url: https://engineering.fb.com/2023/03/09/open-source/velox-open-source-execution-engine/----
----title: Metas_head_of_AR_hardware_on_the_future_of_AR, url: https://engineering.fb.com/2023/02/24/virtual-reality/ar-vr-meta-caitlin-kalinowski/----
----title: How_Meta_brought_AV1_to_Reels, url: https://engineering.fb.com/2023/02/21/video-engineering/av1-codec-facebook-instagram-reels/----
----title: Inside_Metas_first_smart_glasses, url: https://engineering.fb.com/2023/02/16/virtual-reality/developing-meta-rayban-stories/----
----title: Building_a_cross-platform_runtime_for_AR, url: https://engineering.fb.com/2023/02/13/virtual-reality/meta-ar-augmented-reality-cross-platform-runtime/----
----title: Improving_Metas_global_maps, url: https://engineering.fb.com/2023/02/07/web/basemap-facebook-instagram-whatsapp-improvements/----
----title: The_evolution_of_Facebooks_iOS_app_architecture, url: https://engineering.fb.com/2023/02/06/ios/facebook-ios-app-architecture/----
----title: Asynchronous_computing_at_Meta_Overview_and_learnings, url: https://engineering.fb.com/2023/01/31/production-engineering/meta-asynchronous-computing/----
----title: Watch_Metas_engineers_discuss_optimizing_large-scale_networks, url: https://engineering.fb.com/2023/01/27/networking-traffic/optimizing-large-scale-networks-meta-engineers/----
----title: Tulip_Modernizing_Metas_data_platform, url: https://engineering.fb.com/2023/01/26/data-infrastructure/tulip-modernizing-metas-data-platform/----
----title: Open-sourcing_Anonymous_Credential_Service, url: https://engineering.fb.com/2022/12/12/security/anonymous-credential-service-acs-open-source/----
----title: Enabling_static_analysis_of_SQL_queries_at_Meta, url: https://engineering.fb.com/2022/11/30/data-infrastructure/static-analysis-sql-queries/----
----title: Writing_and_linting_Python_at_scale, url: https://engineering.fb.com/2023/11/21/production-engineering/writing-linting-python-at-scale-meta/----
----title: Watch_Metas_engineers_on_building_network_infrastructure_for_AI, url: https://engineering.fb.com/2023/11/15/networking-traffic/watch-metas-engineers-on-building-network-infrastructure-for-ai/----
----title: Enhancing_the_security_of_WhatsApp_calls, url: https://engineering.fb.com/2023/11/08/security/whatsapp-calls-enhancing-security/----
----title: How_Meta_built_Threads_in_5_months, url: https://engineering.fb.com/2023/11/06/android/how-meta-built-threads-in-5-months/----
----title: Automating_data_removal, url: https://engineering.fb.com/2023/10/31/data-infrastructure/automating-data-removal/----
----title: Automating_dead_code_cleanup, url: https://engineering.fb.com/2023/10/24/data-infrastructure/automating-dead-code-cleanup/----
----title: 5_Things_you_didnt_know_about_Buck2, url: https://engineering.fb.com/2023/10/23/developer-tools/5-things-you-didnt-know-about-buck2/----
----title: How_Meta_is_creating_custom_silicon_for_AI, url: https://engineering.fb.com/2023/10/18/ml-applications/meta-ai-custom-silicon-olivia-wu/----
----title: Automating_product_deprecation, url: https://engineering.fb.com/2023/10/17/data-infrastructure/automating-product-deprecation-meta/----
----title: Meta_contributes_new_features_to_Python_312, url: https://engineering.fb.com/2023/10/05/developer-tools/python-312-meta-new-features/----
----title: Meta_Quest_2_Defense_through_offense, url: https://engineering.fb.com/2023/09/12/security/meta-quest-2-defense-through-offense/----
----title: Using_Chakra_execution_traces_for_benchmarking_and_network_performance_optimization, url: https://engineering.fb.com/2023/09/07/networking-traffic/chakra-execution-traces-benchmarking-network-performance-op
timization/----
----title: Arcadia_An_end-to-end_AI_system_performance_simulator, url: https://engineering.fb.com/2023/09/07/data-infrastructure/arcadia-end-to-end-ai-system-performance-simulator/----
----title: Threads_The_inside_story_of_Metas_newest_social_app, url: https://engineering.fb.com/2023/09/07/culture/threads-inside-story-metas-newest-social-app/----
----title: What_is_it_like_to_write_code_at_Meta, url: https://engineering.fb.com/2023/09/05/web/what-like-ship-code-meta-tech-podcast/----
----title: Scheduling_Jupyter_Notebooks_at_Meta, url: https://engineering.fb.com/2023/08/29/security/scheduling-jupyter-notebooks-meta/----
----title: Code_Llama_Metas_state-of-the-art_LLM_for_coding, url: https://ai.meta.com/blog/code-llama-large-language-model-coding/----
----title: Introducing_Immortal_Objects_for_Python, url: https://engineering.fb.com/2023/08/15/developer-tools/immortal-objects-for-python-instagram-meta/----
----title: Meta_Connect_2023_September_27__28, url: https://www.meta.com/blog/quest/connect-2023-september-27-28-menlo-park-vr-ai----
----title: Scaling_the_Instagram_Explore_recommendations_system, url: https://engineering.fb.com/2023/08/09/ml-applications/scaling-instagram-explore-recommendations-system/----
----title: How_Meta_is_improving_password_security_and_preserving_privacy, url: https://engineering.fb.com/2023/08/08/security/how-meta-is-improving-password-security-and-preserving-privacy/----
----title: Fixit_2_Metas_next-generation_auto-fixing_linter, url: https://engineering.fb.com/2023/08/07/developer-tools/fixit-2-linter-meta/----
----title: Using_short-lived_certificates_to_protect_TLS_secrets, url: https://engineering.fb.com/2023/08/07/security/short-lived-certificates-protect-tls-secrets/----
----title: Bringing_HDR_video_to_Reels, url: https://engineering.fb.com/2023/07/17/video-engineering/hdr-video-reels-meta/----
----title: Metas_Evenstar_is_transitioning_to_OCP_to_accelerate_open_RAN_adoption, url: https://engineering.fb.com/2023/06/29/connectivity/evenstar-meta-ocp-open-ran/----
----title: Meta_developer_tools_Working_at_scale, url: https://engineering.fb.com/2023/06/27/developer-tools/meta-developer-tools-open-source/----
----title: Bombyx_is_being_licensed_for_product_development, url: https://engineering.fb.com/2023/05/22/connectivity/bombyx-meta-fiber-deployment-robot-product-development/----
----title: MSVP_is_Metas_first_video_processing_ASIC, url: https://ai.facebook.com/blog/meta-scalable-video-processor-MSVP----
----title: Meta_introduces_its_first-generation_AI_inference_accelerator, url: https://ai.facebook.com/blog/meta-training-inference-accelerator-AI-MTIA----
----title: Building_and_deploying_MySQL_Raft_at_Meta, url: https://engineering.fb.com/2023/05/16/data-infrastructure/mysql-raft-meta/----
----title: The_malware_threat_landscape_NodeStealer_DuckTail_and_more, url: https://engineering.fb.com/2023/05/03/security/malware-nodestealer-ducktail/----
----title: A_fine-grained_network_traffic_analysis_with_Millisampler, url: https://engineering.fb.com/2023/04/17/networking-traffic/millisampler-network-traffic-analysis/----
----title: Deploying_key_transparency_at_WhatsApp, url: https://engineering.fb.com/2023/04/13/security/whatsapp-key-transparency/----
----title: How_Device_Verification_protects_your_WhatsApp_account, url: https://engineering.fb.com/2023/04/13/security/whatsapp-device-verification-protects-your-account/----
----title: Why_xHE-AAC_is_being_embraced_at_Meta, url: https://engineering.fb.com/2023/04/11/video-engineering/high-quality-audio-xhe-aac-codec-meta/----
----title: Build_faster_with_Buck2_Our_open_source_build_system, url: https://engineering.fb.com/2023/04/06/open-source/buck2-open-source-large-scale-build-system/----
2023-11-23 19:42:49 [scrapy.core.engine] INFO: Closing spider (finished)
2023-11-23 19:42:49 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13747,
 'downloader/request_count': 5,

Answered By - Alexander

This Answer collected from stackoverflow and tested by PythonFixing community admins, is licensed under cc by-sa 2.5 , cc by-sa 3.0 and cc by-sa 4.0

Friday, November 24, 2023

[FIXED] Scrapy: cannot reach second callback function when sending request with query parameter strings

Issue

Solution

PARTIAL OUTPUT

0 comments:

Post a Comment

Popular Posts

Labels