Issue
I am scraping the engineering blog posts from Meta. Right now I am just trying to print the title and URL of each post. Thanks for any help.
Here is what I have done so far. The spider never reaches the parse_loadmore callback and doesn't print anything. I have tried pasting loadmore_endpoint into the browser and it works fine there, returning some HTML as expected.
import scrapy
from urllib.parse import urlencode
import pdfkit
import requests
import re
import json
from bs4 import BeautifulSoup
# from ..helpers import generate_pdfs_file_path
# Rendering option flags keyed by hyphenated option name; a value of None
# marks a bare flag with no argument.
# NOTE(review): presumably meant for pdfkit (imported above) -- nothing in
# this snippet reads `options`, confirm against the rest of the project.
options = {
    # 'no-images': None,
    "disable-javascript": None,
    "disable-external-links": None,
    "quiet": None,
    "encoding": "UTF-8",
}
class MetaSpider(scrapy.Spider):
    """Spider that prints the title and URL of posts on Meta's engineering blog.

    NOTE: this is the spider exactly as posted in the question. It never
    prints anything, for the two reasons given in the solution further
    down: the load-more URLs exceed Scrapy's default ``URLLENGTH_LIMIT``,
    and the load-more endpoint answers with JSON, so CSS selectors cannot
    be run on the response directly.
    """

    name = "meta_spider"
    # Not referenced in this snippet; the endpoint actually requested is
    # read from each category page in parse_initial.
    api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
    start_urls = [
        "https://engineering.fb.com/category/core-infra/",
        # "https://engineering.fb.com/category/data-infrastructure/",
        # "https://engineering.fb.com/category/developer-tools/",
        # "https://engineering.fb.com/category/production-engineering/",
        # "https://engineering.fb.com/category/security/",
    ]
    # Not updated anywhere in this snippet.
    post_fetched = 0

    def start_requests(self):
        """Request every category page and route it to parse_initial."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_initial)

    def parse_initial(self, response):
        """Schedule load-more requests for pages 0-3 of a category page."""
        endpoint, query_args = get_loadmore_endpoints_and_params(response)
        for page in range(4):
            params = {
                "action": "loadmore",
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            # print(f"Sending Request {loadmore_endpoint}")
            yield scrapy.Request(url=loadmore_endpoint, callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        """Print a sanitized title and the URL for each post in the response."""
        print("parse_loadmore called with response: {}".format(response.text))
        for post in response.css("article.post"):
            header = post.css("header.entry-header")
            title = header.css(".entry-title a::text").get().strip()
            url = header.css(".entry-title a::attr(href)").get()
            # Sanitize the title to create a valid filename
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")
def clean_post_html(soup):
    """Remove non-content elements from a parsed post, in place.

    Drops every ``<script>`` and ``<noscript>`` element, every element with
    class ``sharedaddy``, and the element with id
    ``post-feat-image-container`` when present.

    Args:
        soup: Parsed document exposing ``find_all`` / ``find`` whose results
            support ``decompose()`` -- presumably a BeautifulSoup object;
            confirm against the caller.

    Returns:
        None. The tree passed in is mutated.
    """
    # Scripts are removed before noscript blocks, one full search per tag
    # name, so each search only sees elements still present in the tree.
    for tag_name in ("script", "noscript"):
        for element in soup.find_all(tag_name):
            element.decompose()

    for element in soup.find_all(class_="sharedaddy"):
        element.decompose()

    image_container = soup.find(id="post-feat-image-container")
    if image_container:
        image_container.decompose()
def get_loadmore_endpoints_and_params(response):
    """Extract the load-more endpoint and query arguments from a page.

    Looks for an inline ``<script>`` containing ``loadmore_params``, finds
    the ``var loadmore_params = ...`` assignment and decodes the JSON value
    assigned there.

    Args:
        response: Object whose ``xpath(query).get()`` returns the script
            text or ``None`` (a Scrapy response in the calling spider).

    Returns:
        A ``(restfulURL, posts)`` tuple taken from the decoded object, or
        ``None`` when the script or the assignment is not found. Callers
        that unpack the result must handle the ``None`` case.

    Raises:
        json.JSONDecodeError: If the assigned value is not valid JSON.
        KeyError: If ``restfulURL`` or ``posts`` is missing.
    """
    script_content = response.xpath(
        '//script[contains(., "loadmore_params")]/text()'
    ).get()
    if not script_content:
        return None

    assignment = re.search(r"var loadmore_params\s*=\s*", script_content)
    if assignment is None:
        return None

    # Decode exactly one JSON value starting right after the "=". Cutting the
    # text at the first ";" instead would truncate the object whenever a
    # string value (e.g. a URL) itself contains a semicolon.
    params, _ = json.JSONDecoder().raw_decode(script_content, assignment.end())
    return params["restfulURL"], params["posts"]
def get_load_more_posts_url(url, params):
    """Return ``url`` with ``params`` appended as a URL-encoded query string.

    Args:
        url: Endpoint URL, with or without an existing query string.
        params: Mapping (or sequence of pairs) of query parameters; sequence
            values are expanded into repeated keys (``doseq=True``).

    Returns:
        The combined URL as a string.
    """
    query_string = urlencode(params, doseq=True)
    # Choose the joiner from what the endpoint already carries, so an
    # endpoint that has its own query string does not end up with two "?".
    if "?" not in url:
        separator = "?"
    elif url.endswith(("?", "&")):
        separator = ""
    else:
        separator = "&"
    return f"{url}{separator}{query_string}"
Solution
There are two things needed to achieve your goal.
1. In settings.py, or in your spider's custom_settings attribute, set "URLLENGTH_LIMIT" to a higher value than the default. The reason is that the load-more endpoint is a very long URL and exceeds the limit imposed by Scrapy by default, so those requests are dropped before they are ever sent.
2. The response received by your parse_loadmore method is JSON, so you cannot run CSS selectors on it directly. The solution is to first call response.json() to get the text, then manually pass that text to a scrapy.Selector and use that to run CSS and XPath queries on the HTML inside the string.
For example:
import scrapy
from urllib.parse import urlencode
import re
import json
# from ..helpers import generate_pdfs_file_path
# Rendering option flags keyed by hyphenated option name; a value of None
# marks a bare flag with no argument.
# NOTE(review): nothing in this snippet reads `options` -- presumably it is
# consumed by PDF-generation code elsewhere in the project; confirm.
options = {
    # 'no-images': None,
    "disable-javascript": None,
    "disable-external-links": None,
    "quiet": None,
    "encoding": "UTF-8",
}
class MetaSpider(scrapy.Spider):
    """Spider that prints the title and URL of posts on Meta's engineering blog.

    Each category page embeds a ``loadmore_params`` script describing the
    REST endpoint used by the site's "load more" button. The spider reads
    that endpoint, requests the first four pages from it, and prints one
    line per post found in the returned HTML fragments.
    """

    name = "meta_spider"
    # Not referenced in this snippet; the endpoint actually requested is
    # read from each category page in parse_initial.
    api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
    start_urls = [
        "https://engineering.fb.com/category/core-infra/",
        # "https://engineering.fb.com/category/data-infrastructure/",
        # "https://engineering.fb.com/category/developer-tools/",
        # "https://engineering.fb.com/category/production-engineering/",
        # "https://engineering.fb.com/category/security/",
    ]
    # Not updated anywhere in this snippet.
    post_fetched = 0
    custom_settings = {
        # The load-more URL carries the whole serialized query in its query
        # string and is far longer than Scrapy's default limit (2083), which
        # would otherwise cause the requests to be dropped.
        "URLLENGTH_LIMIT": 20000,
    }

    def start_requests(self):
        """Request every category page and route it to parse_initial."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_initial)

    def parse_initial(self, response):
        """Schedule load-more requests for pages 0-3 of a category page.

        Logs a warning and schedules nothing when the page does not expose
        ``loadmore_params``.
        """
        found = get_loadmore_endpoints_and_params(response)
        if found is None:
            # The helper returns None when the script/assignment is absent;
            # unpacking it directly would raise TypeError.
            self.logger.warning("loadmore_params not found on %s", response.url)
            return
        endpoint, query_args = found

        for page in range(4):
            params = {
                "action": "loadmore",
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            yield scrapy.Request(url=loadmore_endpoint, callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        """Print a sanitized title and the URL for each post in the response.

        The endpoint answers with JSON whose decoded value is an HTML
        string, so the HTML is wrapped in a Selector before querying it.
        """
        # print("parse_loadmore called with response: {}".format(response.text))
        html = response.json()
        if not isinstance(html, str):
            # Anything other than an HTML string cannot be fed to Selector.
            self.logger.warning("Unexpected load-more payload from %s", response.url)
            return

        resp = scrapy.Selector(text=html)
        for post in resp.css("article.post"):
            header = post.css("header.entry-header")
            raw_title = header.css(".entry-title a::text").get()
            if raw_title is None:
                # No title link in this article; skip it instead of crashing
                # on None.strip().
                continue
            title = raw_title.strip()
            url = header.css(".entry-title a::attr(href)").get()
            # Sanitize the title to create a valid filename
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")
def clean_post_html(soup):
    """Remove non-content elements from a parsed post, in place.

    Drops every ``<script>`` and ``<noscript>`` element, every element with
    class ``sharedaddy``, and the element with id
    ``post-feat-image-container`` when present.

    Args:
        soup: Parsed document exposing ``find_all`` / ``find`` whose results
            support ``decompose()`` -- presumably a BeautifulSoup object;
            confirm against the caller.

    Returns:
        None. The tree passed in is mutated.
    """
    # Scripts are removed before noscript blocks, one full search per tag
    # name, so each search only sees elements still present in the tree.
    for tag_name in ("script", "noscript"):
        for element in soup.find_all(tag_name):
            element.decompose()

    for element in soup.find_all(class_="sharedaddy"):
        element.decompose()

    image_container = soup.find(id="post-feat-image-container")
    if image_container:
        image_container.decompose()
def get_loadmore_endpoints_and_params(response):
    """Extract the load-more endpoint and query arguments from a page.

    Looks for an inline ``<script>`` containing ``loadmore_params``, finds
    the ``var loadmore_params = ...`` assignment and decodes the JSON value
    assigned there.

    Args:
        response: Object whose ``xpath(query).get()`` returns the script
            text or ``None`` (a Scrapy response in the calling spider).

    Returns:
        A ``(restfulURL, posts)`` tuple taken from the decoded object, or
        ``None`` when the script or the assignment is not found. Callers
        that unpack the result must handle the ``None`` case.

    Raises:
        json.JSONDecodeError: If the assigned value is not valid JSON.
        KeyError: If ``restfulURL`` or ``posts`` is missing.
    """
    script_content = response.xpath(
        '//script[contains(., "loadmore_params")]/text()'
    ).get()
    if not script_content:
        return None

    assignment = re.search(r"var loadmore_params\s*=\s*", script_content)
    if assignment is None:
        return None

    # Decode exactly one JSON value starting right after the "=". Cutting the
    # text at the first ";" instead would truncate the object whenever a
    # string value (e.g. a URL) itself contains a semicolon.
    params, _ = json.JSONDecoder().raw_decode(script_content, assignment.end())
    return params["restfulURL"], params["posts"]
def get_load_more_posts_url(url, params):
    """Return ``url`` with ``params`` appended as a URL-encoded query string.

    Args:
        url: Endpoint URL, with or without an existing query string.
        params: Mapping (or sequence of pairs) of query parameters; sequence
            values are expanded into repeated keys (``doseq=True``).

    Returns:
        The combined URL as a string.
    """
    query_string = urlencode(params, doseq=True)
    # Choose the joiner from what the endpoint already carries, so an
    # endpoint that has its own query string does not end up with two "?".
    if "?" not in url:
        separator = "?"
    elif url.endswith(("?", "&")):
        separator = ""
    else:
        separator = "&"
    return f"{url}{separator}{query_string}"
PARTIAL OUTPUT
2023-11-23 19:42:48 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/category/core-infra/> (referer: None)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22%5C%22%2C%5C%22m%5C%22%3A%5C%22%5C%22%2C%5C%22p%5C%22%3A0%2C%5C%22post_parent%5C%22%3A%5C%22%5C%22%2C%5C%22subpost%5C%22%3A%5C%22%5C%22%2C%5C%22subpost_id%5C%22%3A%5C%22%5C%22%2C%5C%22attachment%5C%22%3A%5C....5C%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=3&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22%5C%22%2C%5C%22m%5C%22%3A%5C%22%5C%22%2C%5C%22p%5C%22%3A0%2C%5C%22post_parent%5C%22%3A%5C%22%5C%22%2C%5C%22subpost%5C%22%3A%5C%22%5C%22%2C%5C%22subpost_id%5C%22%3A%5C%22%5C%22%2C%5C%22attachment%5C%22%3A%5C....%22&page=0&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22...e%5C%22%3A%5C%2250%5C%22%2C%5C%22no_found_rows%5C%22%3Afalse%2C%5C
%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=1&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
22%3A%5C%22%5C...2%3Atrue%2C%5C%22post_type%5C%22%3A%5C%22%5C%22%2C%5C%22posts_per_page%5C%22%3A12%2C%5C%22nopaging%5C%22%3Afalse%2C%5C%22comments_per_page%5C%22%3A%5C%2250%5C%22%2C%5C%22no_found_rows%5C%22%3Afalse%2C%5C
%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=2&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
----title: Introducing_Velox_An_open_source_unified_execution_engine, url: https://engineering.fb.com/2023/03/09/open-source/velox-open-source-execution-engine/----
----title: Metas_head_of_AR_hardware_on_the_future_of_AR, url: https://engineering.fb.com/2023/02/24/virtual-reality/ar-vr-meta-caitlin-kalinowski/----
----title: How_Meta_brought_AV1_to_Reels, url: https://engineering.fb.com/2023/02/21/video-engineering/av1-codec-facebook-instagram-reels/----
----title: Inside_Metas_first_smart_glasses, url: https://engineering.fb.com/2023/02/16/virtual-reality/developing-meta-rayban-stories/----
----title: Building_a_cross-platform_runtime_for_AR, url: https://engineering.fb.com/2023/02/13/virtual-reality/meta-ar-augmented-reality-cross-platform-runtime/----
----title: Improving_Metas_global_maps, url: https://engineering.fb.com/2023/02/07/web/basemap-facebook-instagram-whatsapp-improvements/----
----title: The_evolution_of_Facebooks_iOS_app_architecture, url: https://engineering.fb.com/2023/02/06/ios/facebook-ios-app-architecture/----
----title: Asynchronous_computing_at_Meta_Overview_and_learnings, url: https://engineering.fb.com/2023/01/31/production-engineering/meta-asynchronous-computing/----
----title: Watch_Metas_engineers_discuss_optimizing_large-scale_networks, url: https://engineering.fb.com/2023/01/27/networking-traffic/optimizing-large-scale-networks-meta-engineers/----
----title: Tulip_Modernizing_Metas_data_platform, url: https://engineering.fb.com/2023/01/26/data-infrastructure/tulip-modernizing-metas-data-platform/----
----title: Open-sourcing_Anonymous_Credential_Service, url: https://engineering.fb.com/2022/12/12/security/anonymous-credential-service-acs-open-source/----
----title: Enabling_static_analysis_of_SQL_queries_at_Meta, url: https://engineering.fb.com/2022/11/30/data-infrastructure/static-analysis-sql-queries/----
----title: Writing_and_linting_Python_at_scale, url: https://engineering.fb.com/2023/11/21/production-engineering/writing-linting-python-at-scale-meta/----
----title: Watch_Metas_engineers_on_building_network_infrastructure_for_AI, url: https://engineering.fb.com/2023/11/15/networking-traffic/watch-metas-engineers-on-building-network-infrastructure-for-ai/----
----title: Enhancing_the_security_of_WhatsApp_calls, url: https://engineering.fb.com/2023/11/08/security/whatsapp-calls-enhancing-security/----
----title: How_Meta_built_Threads_in_5_months, url: https://engineering.fb.com/2023/11/06/android/how-meta-built-threads-in-5-months/----
----title: Automating_data_removal, url: https://engineering.fb.com/2023/10/31/data-infrastructure/automating-data-removal/----
----title: Automating_dead_code_cleanup, url: https://engineering.fb.com/2023/10/24/data-infrastructure/automating-dead-code-cleanup/----
----title: 5_Things_you_didnt_know_about_Buck2, url: https://engineering.fb.com/2023/10/23/developer-tools/5-things-you-didnt-know-about-buck2/----
----title: How_Meta_is_creating_custom_silicon_for_AI, url: https://engineering.fb.com/2023/10/18/ml-applications/meta-ai-custom-silicon-olivia-wu/----
----title: Automating_product_deprecation, url: https://engineering.fb.com/2023/10/17/data-infrastructure/automating-product-deprecation-meta/----
----title: Meta_contributes_new_features_to_Python_312, url: https://engineering.fb.com/2023/10/05/developer-tools/python-312-meta-new-features/----
----title: Meta_Quest_2_Defense_through_offense, url: https://engineering.fb.com/2023/09/12/security/meta-quest-2-defense-through-offense/----
----title: Using_Chakra_execution_traces_for_benchmarking_and_network_performance_optimization, url: https://engineering.fb.com/2023/09/07/networking-traffic/chakra-execution-traces-benchmarking-network-performance-op
timization/----
----title: Arcadia_An_end-to-end_AI_system_performance_simulator, url: https://engineering.fb.com/2023/09/07/data-infrastructure/arcadia-end-to-end-ai-system-performance-simulator/----
----title: Threads_The_inside_story_of_Metas_newest_social_app, url: https://engineering.fb.com/2023/09/07/culture/threads-inside-story-metas-newest-social-app/----
----title: What_is_it_like_to_write_code_at_Meta, url: https://engineering.fb.com/2023/09/05/web/what-like-ship-code-meta-tech-podcast/----
----title: Scheduling_Jupyter_Notebooks_at_Meta, url: https://engineering.fb.com/2023/08/29/security/scheduling-jupyter-notebooks-meta/----
----title: Code_Llama_Metas_state-of-the-art_LLM_for_coding, url: https://ai.meta.com/blog/code-llama-large-language-model-coding/----
----title: Introducing_Immortal_Objects_for_Python, url: https://engineering.fb.com/2023/08/15/developer-tools/immortal-objects-for-python-instagram-meta/----
----title: Meta_Connect_2023_September_27__28, url: https://www.meta.com/blog/quest/connect-2023-september-27-28-menlo-park-vr-ai----
----title: Scaling_the_Instagram_Explore_recommendations_system, url: https://engineering.fb.com/2023/08/09/ml-applications/scaling-instagram-explore-recommendations-system/----
----title: How_Meta_is_improving_password_security_and_preserving_privacy, url: https://engineering.fb.com/2023/08/08/security/how-meta-is-improving-password-security-and-preserving-privacy/----
----title: Fixit_2_Metas_next-generation_auto-fixing_linter, url: https://engineering.fb.com/2023/08/07/developer-tools/fixit-2-linter-meta/----
----title: Using_short-lived_certificates_to_protect_TLS_secrets, url: https://engineering.fb.com/2023/08/07/security/short-lived-certificates-protect-tls-secrets/----
----title: Bringing_HDR_video_to_Reels, url: https://engineering.fb.com/2023/07/17/video-engineering/hdr-video-reels-meta/----
----title: Metas_Evenstar_is_transitioning_to_OCP_to_accelerate_open_RAN_adoption, url: https://engineering.fb.com/2023/06/29/connectivity/evenstar-meta-ocp-open-ran/----
----title: Meta_developer_tools_Working_at_scale, url: https://engineering.fb.com/2023/06/27/developer-tools/meta-developer-tools-open-source/----
----title: Bombyx_is_being_licensed_for_product_development, url: https://engineering.fb.com/2023/05/22/connectivity/bombyx-meta-fiber-deployment-robot-product-development/----
----title: MSVP_is_Metas_first_video_processing_ASIC, url: https://ai.facebook.com/blog/meta-scalable-video-processor-MSVP----
----title: Meta_introduces_its_first-generation_AI_inference_accelerator, url: https://ai.facebook.com/blog/meta-training-inference-accelerator-AI-MTIA----
----title: Building_and_deploying_MySQL_Raft_at_Meta, url: https://engineering.fb.com/2023/05/16/data-infrastructure/mysql-raft-meta/----
----title: The_malware_threat_landscape_NodeStealer_DuckTail_and_more, url: https://engineering.fb.com/2023/05/03/security/malware-nodestealer-ducktail/----
----title: A_fine-grained_network_traffic_analysis_with_Millisampler, url: https://engineering.fb.com/2023/04/17/networking-traffic/millisampler-network-traffic-analysis/----
----title: Deploying_key_transparency_at_WhatsApp, url: https://engineering.fb.com/2023/04/13/security/whatsapp-key-transparency/----
----title: How_Device_Verification_protects_your_WhatsApp_account, url: https://engineering.fb.com/2023/04/13/security/whatsapp-device-verification-protects-your-account/----
----title: Why_xHE-AAC_is_being_embraced_at_Meta, url: https://engineering.fb.com/2023/04/11/video-engineering/high-quality-audio-xhe-aac-codec-meta/----
----title: Build_faster_with_Buck2_Our_open_source_build_system, url: https://engineering.fb.com/2023/04/06/open-source/buck2-open-source-large-scale-build-system/----
2023-11-23 19:42:49 [scrapy.core.engine] INFO: Closing spider (finished)
2023-11-23 19:42:49 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13747,
'downloader/request_count': 5,
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.